diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
index e220fc52738d..b328c5a4ac1a 100644
--- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
@@ -703,8 +703,22 @@ static void shareByValParams(Function &F, const Triple &TT) {
         spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
 
     // 3) replace argument with shadow in all uses
+    Value *RepVal = Shadow;
+    if (TT.isNVPTX()) {
+      // For the NVPTX target, address space inference for kernel arguments
+      // and allocas happens in the backend (NVPTXLowerArgs and
+      // NVPTXLowerAlloca passes). After the frontend these pointers are in
+      // LLVM's default address space 0, which is the generic address space
+      // for NVPTX.
+      assert(Arg.getType()->getPointerAddressSpace() == 0);
+
+      // Cast a pointer in the shared address space to the generic address
+      // space.
+      RepVal =
+          ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
+    }
     for (auto *U : Arg.users())
-      U->replaceUsesOfWith(&Arg, Shadow);
+      U->replaceUsesOfWith(&Arg, RepVal);
 
     // 4) fill the shadow from the argument for the leader WI only
     LLVMContext &Ctx = At.getContext();
diff --git a/llvm/test/SYCLLowerIR/cast_shadow.ll b/llvm/test/SYCLLowerIR/cast_shadow.ll
new file mode 100644
index 000000000000..1822f1c5e814
--- /dev/null
+++ b/llvm/test/SYCLLowerIR/cast_shadow.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -LowerWGScope -verify -S | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda-sycldevice"
+
+%struct.baz = type { i8 }
+%struct.spam = type { %struct.wobble, %struct.wobble, %struct.wobble, %struct.wombat.0 }
+%struct.wobble = type { %struct.wombat }
+%struct.wombat = type { [1 x i64] }
+%struct.wombat.0 = type { %struct.wombat }
+%struct.quux = type { i8 }
+
+; CHECK: @[[SHADOW:[a-zA-Z0-9]+]] = internal unnamed_addr addrspace(3) global %struct.spam undef
+
+define internal void @wobble(%struct.baz* %arg, %struct.spam* byval(%struct.spam) %arg1) !work_group_scope !0 {
+; CHECK: [[TMP10:%.*]] = bitcast %struct.spam* [[ARG1:%.*]] to i8*
+; CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i8* [[TMP10]], i64 32, i1 false)
+; CHECK: call void @widget(%struct.spam* addrspacecast (%struct.spam addrspace(3)* @[[SHADOW]] to %struct.spam*), %struct.quux* byval(%struct.quux) [[TMP2:%.*]])
+;
+bb:
+  %tmp = alloca %struct.baz*
+  %tmp2 = alloca %struct.quux
+  store %struct.baz* %arg, %struct.baz** %tmp
+  %tmp3 = load %struct.baz*, %struct.baz** %tmp
+  call void @widget(%struct.spam* %arg1, %struct.quux* byval(%struct.quux) %tmp2)
+  ret void
+}
+
+define internal void @widget(%struct.spam* %arg, %struct.quux* byval(%struct.quux) %arg1) !work_item_scope !0 !parallel_for_work_item !0 {
+bb:
+  ret void
+}
+
+!0 = !{}