diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h index 6b577c02f0545..7f64688fba183 100644 --- a/llvm/include/llvm/Analysis/DXILResource.h +++ b/llvm/include/llvm/Analysis/DXILResource.h @@ -264,6 +264,8 @@ class ResourceInfo { class DXILResourceMap { SmallVector Resources; DenseMap CallMap; + // Mapping from Resource use to Resource Handle + DenseMap ResUseToHandleMap; unsigned FirstUAV = 0; unsigned FirstCBuffer = 0; unsigned FirstSampler = 0; @@ -335,6 +337,36 @@ class DXILResourceMap { } void print(raw_ostream &OS) const; + + void updateResourceMap(CallInst *origCallInst, CallInst *newCallInst); + + // Update ResUseMap with multiple new resource uses + void updateResUseMap(CallInst *origResUse, + std::vector &multiNewResUse); + + // Update ResUseMap with single new resource use + void updateResUseMap(CallInst *origResUse, CallInst *newResUse) { + assert((origResUse != nullptr) && (newResUse != nullptr) && + (origResUse != newResUse) && "Wrong Inputs"); + + updateResUseMapCommon(origResUse, newResUse, /*keepOrigResUseInMap=*/false); + } + + CallInst *findResHandleByUse(CallInst *resUse) { + auto Pos = ResUseToHandleMap.find(resUse); + assert((Pos != ResUseToHandleMap.end()) && + "Can't find the resource handle"); + + return Pos->second; + } + +private: + void updateResUseMapCommon(CallInst *origResUse, CallInst *newResUse, + bool keepOrigResUseInMap) { + ResUseToHandleMap.try_emplace(newResUse, findResHandleByUse(origResUse)); + if (!keepOrigResUseInMap) + ResUseToHandleMap.erase(origResUse); + } }; class DXILResourceAnalysis : public AnalysisInfoMixin { diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 48a9595f844f0..d8be8d002a091 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -27,6 +27,9 @@ def int_dx_handle_fromBinding [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], [IntrNoMem]>; +def int_dx_rawBufferLoad + : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty, llvm_i32_ty]>; + def int_dx_typedBufferLoad : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty], [IntrReadMem]>; diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 2802480481690..be4fad8a84a17 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -719,6 +719,12 @@ DXILResourceMap::DXILResourceMap( if (Resources.empty() || RI != Resources.back()) Resources.push_back(RI); CallMap[CI] = Resources.size() - 1; + + // Build ResUseToHandleMap + for (auto it = CI->users().begin(); it != CI->users().end(); ++it) { + CallInst *CI_Use = dyn_cast(*it); + ResUseToHandleMap[CI_Use] = CI; + } } unsigned Size = Resources.size(); @@ -744,6 +750,61 @@ DXILResourceMap::DXILResourceMap( } } +// Parameter origCallInst: original Resource Handle +// Parameter newCallInst: new Resource Handle +// +// This function is needed when origCallInst's lowered to newCallInst. +// +// Because origCallInst and its uses will be replaced by newCallInst and new def +// instructions after lowering. The [origCallInst, resource info] entry in +// CallMap and [origCallInst's use, origCallInst] entries in ResUseToHandleMap +// have to be updated per the changes in lowering. +// +// What this function does are: +// 1. Add [newCallInst, resource info] entry in CallMap +// 2. Remove [origCallInst, resource info] entry in CallMap +// 3. Remap [origCallInst's use, origCallInst] entries to +// [origCallInst's use, newCallInst] entries in ResUseToHandleMap +// +// Remove those entries related to origCallInst in maps is necessary since +// origCallInst's no longer existing after lowering. Moreover, keeping those +// entries in maps will crash DXILResourceMap::print function +// +// FYI: +// Make sure to invoke this function before origCallInst->replaceAllUsesWith() +// and origCallInst->eraseFromParent() since this function needs to visit +// origCallInst and its uses. +// +void DXILResourceMap::updateResourceMap(CallInst *origCallInst, + CallInst *newCallInst) { + assert((origCallInst != nullptr) && (newCallInst != nullptr) && + (origCallInst != newCallInst)); + + CallMap.try_emplace(newCallInst, CallMap[origCallInst]); + CallMap.erase(origCallInst); + + // Update ResUseToHandleMap since Resource Handle changed + for (auto it = origCallInst->users().begin(); + it != origCallInst->users().end(); ++it) { + CallInst *CI_Use = dyn_cast(*it); + ResUseToHandleMap[CI_Use] = newCallInst; + } +} + + void DXILResourceMap::updateResUseMap(CallInst *origResUse, + std::vector &multiNewResUse) { + assert((origResUse != nullptr) && "Wrong Inputs"); + + for (int i = 0; i < multiNewResUse.size(); ++i) { + CallInst *newResUse = dyn_cast(multiNewResUse[i]); + assert(newResUse != nullptr); + + bool keepOrigResUseInMap = + i == (multiNewResUse.size() - 1) ? false : true; + updateResUseMapCommon(origResUse, newResUse, keepOrigResUseInMap); + } + } + void DXILResourceMap::print(raw_ostream &OS) const { for (unsigned I = 0, E = Resources.size(); I != E; ++I) { OS << "Binding " << I << ":\n"; @@ -756,6 +817,14 @@ void DXILResourceMap::print(raw_ostream &OS) const { CI->print(OS); OS << "\n"; } + + for (const auto &[ResUse, ResHandle] : ResUseToHandleMap) { + OS << "\n"; + OS << "Resource " << CallMap.find(ResHandle)->second; + OS << " is used by "; + ResUse->print(OS); + OS << "\n"; + } } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 1a8e110491cc8..a638371d6031a 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -854,6 +854,17 @@ def AnnotateHandle : DXILOp<216, annotateHandle> { let stages = [Stages]; } +def RawBufferLoad : DXILOp<139, rawBufferLoad> { + let Doc = "reads from a ByteAddressBuffer or StructuredBuffer"; + // Handle, Coord0, Coord1, mask, alignment + let arguments = [HandleTy, Int32Ty, Int32Ty, Int8Ty, Int32Ty]; + let result = OverloadTy; + let overloads = + [Overloads]; + let stages = [Stages]; +} + def CreateHandleFromBinding : DXILOp<217, createHandleFromBinding> { let Doc = "create resource handle from binding"; let arguments = [ResBindTy, Int32Ty, Int1Ty]; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 9f124394363a3..d2b6f05fc936b 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -249,6 +249,8 @@ class OpLowerer { removeResourceGlobals(CI); + DRM.updateResourceMap(CI, *OpCall); + CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); return Error::success(); @@ -295,6 +297,8 @@ class OpLowerer { removeResourceGlobals(CI); + DRM.updateResourceMap(CI, *OpBind); + CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); @@ -479,6 +483,9 @@ class OpLowerer { OpCode::BufferLoad, Args, CI->getName(), NewRetTy); if (Error E = OpCall.takeError()) return E; + + DRM.updateResUseMap(CI, *OpCall); + if (Error E = replaceResRetUses(CI, *OpCall, HasCheckBit)) return E; @@ -547,6 +554,8 @@ class OpLowerer { if (Error E = OpCall.takeError()) return E; + DRM.updateResUseMap(CI, *OpCall); + CI->eraseFromParent(); return Error::success(); }); @@ -619,6 +628,193 @@ class OpLowerer { }); } + Value *GenerateRawBufLd(Value *handle, Value *bufIdx, Value *offset, Type *Ty, + IRBuilder<> &Builder, unsigned NumComponents, + Constant *alignment) { + if (bufIdx == nullptr) { + // This is actually a byte address buffer load with a struct template + // type. The call takes only one coordinates for the offset. + bufIdx = offset; + offset = UndefValue::get(offset->getType()); + } + + // NumComponents 1: mask = 1 // Mask_X; + // NumComponents 2: mask = 3 // Mask_X | Mask_Y + // NumComponents 3: mask = 7 // Mask_X | Mask_Y | Mask_Z + // NumComponents 4: mask = 15 // Mask_X | Mask_Y | Mask_Z | Mask_W + assert((NumComponents) > 0 && (NumComponents < 5)); + Constant *mask = + ConstantInt::get(Builder.getInt8Ty(), ((1 << NumComponents) - 1)); + + Value *Args[] = {handle, bufIdx, offset, mask, alignment}; + Type *NewRetTy = OpBuilder.getResRetType(Ty->getScalarType()); + Expected OpCall = OpBuilder.tryCreateOp( + OpCode::RawBufferLoad, Args, "", NewRetTy); // TODO: Need name argument? + if (Error E = OpCall.takeError()) + return nullptr; + + return *OpCall; + } + + void TranslateRawBufVecLd(Type *Ty, unsigned ElemCount, IRBuilder<> &Builder, + Value *handle, Value *bufIdx, Value *baseOffset, + const DataLayout &DL, std::vector &bufLds, + unsigned baseAlign, bool isScalarTy) { + Type *VecEltTy = Ty->getScalarType(); + + unsigned EltSize = DL.getTypeAllocSize(VecEltTy); + unsigned alignment = std::min(baseAlign, EltSize); + Constant *alignmentVal = + ConstantInt::get(M.getContext(), APInt(32, alignment)); + + if (baseOffset == nullptr) { + baseOffset = ConstantInt::get(Builder.getInt32Ty(), 0); + } + + std::vector elts(ElemCount); + unsigned rest = (ElemCount % 4); + for (unsigned i = 0; i < ElemCount - rest; i += 4) { + Value *bufLd = GenerateRawBufLd(handle, bufIdx, baseOffset, Ty, Builder, + 4, alignmentVal); + bufLds.emplace_back(bufLd); + + baseOffset = Builder.CreateAdd( + baseOffset, ConstantInt::get(Builder.getInt32Ty(), 4 * EltSize)); + } + + if (rest) { + Value *bufLd = GenerateRawBufLd(handle, bufIdx, baseOffset, Ty, Builder, + rest, alignmentVal); + bufLds.emplace_back(bufLd); + } + } + + Error replaceMultiResRetsUses(CallInst *Intrin, + std::vector &bufLds) { + IRBuilder<> &IRB = OpBuilder.getIRB(); + + // TODO: HasCheckBit???? + + Type *OldTy = Intrin->getType(); + + // For scalars, we just extract the first element. + if (!isa(OldTy)) { + CallInst *Op = dyn_cast(bufLds[0]); + assert(Op != nullptr); + Value *EVI = IRB.CreateExtractValue(Op, 0); + + Intrin->replaceAllUsesWith(EVI); + DRM.updateResUseMap(Intrin, Op); + Intrin->eraseFromParent(); + + return Error::success(); + } + + const auto *VecTy = cast(OldTy); + const unsigned N = VecTy->getNumElements(); + + std::vector Extracts(N); + + // The users of the operation should all be scalarized, so we attempt to + // replace the extractelements with extractvalues directly. + for (Use &U : make_early_inc_range(Intrin->uses())) { + if (auto *EEI = dyn_cast(U.getUser())) { + if (auto *IndexOp = dyn_cast(EEI->getIndexOperand())) { + size_t IndexVal = IndexOp->getZExtValue(); + assert(IndexVal < N && "Index into buffer load out of range"); + if (!Extracts[IndexVal]) { + CallInst *Op = dyn_cast(bufLds[IndexVal / 4]); + assert(Op != nullptr); + Extracts[IndexVal] = IRB.CreateExtractValue(Op, IndexVal % 4); + } + EEI->replaceAllUsesWith(Extracts[IndexVal]); + EEI->eraseFromParent(); + } else { + // Need to handle DynamicAccesses here??? + } + } + } + + // If there's a dynamic access we need to round trip through stack memory so + // that we don't leave vectors around. + // + // TODO: dynamic access for rawbuffer?????? + // + + // If we still have uses, then we're not fully scalarized and need to + // recreate the vector. This should only happen for things like exported + // functions from libraries. + if (!Intrin->use_empty()) { + for (int I = 0, E = N; I != E; ++I) + if (!Extracts[I]) { + CallInst *Op = dyn_cast(bufLds[I / 4]); + assert(Op != nullptr); + Extracts[I] = IRB.CreateExtractValue(Op, I % 4); + } + + Value *Vec = UndefValue::get(OldTy); + for (int I = 0, E = N; I != E; ++I) + Vec = IRB.CreateInsertElement(Vec, Extracts[I], I); + + Intrin->replaceAllUsesWith(Vec); + } + + // TODO: + // Remove the dx.op.rawbufferload without any uses now? + + DRM.updateResUseMap(Intrin, bufLds); + Intrin->eraseFromParent(); + + return Error::success(); + } + + [[nodiscard]] bool lowerRawBufferLoad(Function &F) { + IRBuilder<> &IRB = OpBuilder.getIRB(); + + return replaceFunction(F, [&](CallInst *CI) -> Error { + IRB.SetInsertPoint(CI); + + auto *It = DRM.find(DRM.findResHandleByUse(CI)); + assert(It != DRM.end() && "Resource not in map?"); + dxil::ResourceInfo &RI = *It; + + ResourceKind RCKind = RI.getResourceKind(); + assert((RCKind == dxil::ResourceKind::StructuredBuffer) || + (RCKind == dxil::ResourceKind::RawBuffer)); + + Type *Ty = CI->getType(); + std::vector bufLds; + // TODO: Need check Bool type load??? + + unsigned numComponents = 1; + if (Ty->isVectorTy()) { + numComponents = dyn_cast(Ty)->getNumElements(); + } + + Value *Handle = + createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType()); + Value *bufIdx = CI->getArgOperand(1); + Value *baseOffset = CI->getArgOperand(2); + + bool isScalarTy = !Ty->isVectorTy(); + + if (RCKind == dxil::ResourceKind::StructuredBuffer) { + TranslateRawBufVecLd(Ty, numComponents, IRB, Handle, bufIdx, baseOffset, + F.getDataLayout(), bufLds, + /*baseAlign (in bytes)*/ 8, isScalarTy); + } else { + TranslateRawBufVecLd(Ty, numComponents, IRB, Handle, bufIdx, baseOffset, + F.getDataLayout(), bufLds, + /*baseAlign (in bytes)*/ 4, isScalarTy); + } + + if (Error E = replaceMultiResRetsUses(CI, bufLds)) + return E; + + return Error::success(); + }); + } + bool lowerIntrinsics() { bool Updated = false; bool HasErrors = false; @@ -638,6 +834,9 @@ class OpLowerer { case Intrinsic::dx_handle_fromBinding: HasErrors |= lowerHandleFromBinding(F); break; + case Intrinsic::dx_rawBufferLoad: + HasErrors |= lowerRawBufferLoad(F); + break; case Intrinsic::dx_typedBufferLoad: HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/false); break; diff --git a/llvm/test/Analysis/DXILResource/resource-map.ll b/llvm/test/Analysis/DXILResource/resource-map.ll new file mode 100644 index 0000000000000..65255d4c942e5 --- /dev/null +++ b/llvm/test/Analysis/DXILResource/resource-map.ll @@ -0,0 +1,36 @@ +; RUN: opt -S -disable-output -disable-output -passes="print" < %s 2>&1 | FileCheck %s + +define void @test_typedbuffer() { + ; RWBuffer Buf : register(u5, space3) + %uav1 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0( + i32 3, i32 5, i32 1, i32 0, i1 false) + ; CHECK: Binding [[UAV1:[0-9]+]]: + ; CHECK: Symbol: ptr undef + ; CHECK: Name: "" + ; CHECK: Binding: + ; CHECK: Record ID: 0 + ; CHECK: Space: 3 + ; CHECK: Lower Bound: 5 + ; CHECK: Size: 1 + ; CHECK: Class: UAV + ; CHECK: Kind: TypedBuffer + ; CHECK: Globally Coherent: 0 + ; CHECK: HasCounter: 0 + ; CHECK: IsROV: 0 + ; CHECK: Element Type: f32 + ; CHECK: Element Count: 4 + + ; CHECK: Call bound to [[UAV1]]: %uav1 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) + ; CHECK-DAG: Resource [[UAV1]] is used by %data0 = call <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, i32 0) + ; CHECK-DAG: Resource [[UAV1]] is used by call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, i32 2, <4 x float> %data0) + + %data0 = call <4 x float> @llvm.dx.typedBufferLoad( + target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, i32 0) + call void @llvm.dx.typedBufferStore( + target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, + i32 2, <4 x float> %data0) + + ret void +} + diff --git a/llvm/test/CodeGen/DirectX/DXILResource/dxil-resource-map.ll b/llvm/test/CodeGen/DirectX/DXILResource/dxil-resource-map.ll new file mode 100644 index 0000000000000..ac5f3d1614597 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/DXILResource/dxil-resource-map.ll @@ -0,0 +1,48 @@ +; RUN: opt -S -disable-output -disable-output -passes="print,dxil-op-lower,print" -mtriple=dxil-pc-shadermodel6.6-compute < %s 2>&1 | FileCheck %s -check-prefixes=CHECK,CHECK_SM66 +; RUN: opt -S -disable-output -disable-output -passes="print,dxil-op-lower,print" -mtriple=dxil-pc-shadermodel6.2-compute < %s 2>&1 | FileCheck %s -check-prefixes=CHECK,CHECK_SM62 + +define void @test_typedbuffer() { + ; RWBuffer Buf : register(u5, space3) + %uav1 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0( + i32 3, i32 5, i32 1, i32 0, i1 false) + ; CHECK: Binding [[UAV1:[0-9]+]]: + ; CHECK: Symbol: ptr undef + ; CHECK: Name: "" + ; CHECK: Binding: + ; CHECK: Record ID: 0 + ; CHECK: Space: 3 + ; CHECK: Lower Bound: 5 + ; CHECK: Size: 1 + ; CHECK: Class: UAV + ; CHECK: Kind: TypedBuffer + ; CHECK: Globally Coherent: 0 + ; CHECK: HasCounter: 0 + ; CHECK: IsROV: 0 + ; CHECK: Element Type: f32 + ; CHECK: Element Count: 4 + + ; CHECK: Call bound to [[UAV1]]: %uav1 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) + ; CHECK-DAG: Resource [[UAV1]] is used by %data0 = call <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, i32 0) + ; CHECK-DAG: Resource [[UAV1]] is used by call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, i32 2, <4 x float> %data0) + + %data0 = call <4 x float> @llvm.dx.typedBufferLoad( + target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, i32 0) + call void @llvm.dx.typedBufferStore( + target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %uav1, + i32 2, <4 x float> %data0) + + ; + ;;; After dxil-op-lower, the DXILResourceMap info should be updated. + ; + ; CHECK_SM66: Call bound to [[UAV1]]: %uav11 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 5, i32 5, i32 3, i8 1 }, i32 0, i1 false) + ; CHECK_SM66-DAG: Resource [[UAV1]] is used by %data02 = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %uav1_annot, i32 0, i32 undef) + ; CHECK_SM66-DAG: Resource [[UAV1]] is used by call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %uav1_annot, i32 2, i32 undef, float %9, float %10, float %11, float %12, i8 15) + ; + ; CHECK_SM62: Call bound to [[UAV1]]: %uav11 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false) + ; CHECK_SM62-DAG: Resource [[UAV1]] is used by %data02 = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %uav11, i32 0, i32 undef) + ; CHECK_SM62-DAG: Resource [[UAV1]] is used by call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %uav11, i32 2, i32 undef, float %9, float %10, float %11, float %12, i8 15) + + ret void +} + diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll new file mode 100644 index 0000000000000..500fec544e36d --- /dev/null +++ b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll @@ -0,0 +1,192 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +declare void @scalar_user(float) +declare void @vector_user(<4 x float>) +declare void @check_user(i1) + +declare void @vector_user_v3f32x4(<12 x float>) + +;; StructureBuffer load + +define void @loadv4f32() { + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BIND]] + %buffer = call target("dx.RawBuffer", <4 x float>, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_v4f32_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; The temporary casts should all have been cleaned up + ; CHECK-NOT: %dx.cast_handle + + ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[HANDLE]], i32 0, i32 0, i8 15, i32 4) + %data0 = call <4 x float> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", <4 x float>, 0, 0) %buffer, i32 0, i32 0) + + ; The extract order depends on the users, so don't enforce that here. + ; CHECK-DAG: [[VAL0_0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0 + %data0_0 = extractelement <4 x float> %data0, i32 0 + ; CHECK-DAG: [[VAL0_2:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 2 + %data0_2 = extractelement <4 x float> %data0, i32 2 + + ; If all of the uses are extracts, we skip creating a vector + ; CHECK-NOT: insertelement + ; CHECK-DAG: call void @scalar_user(float [[VAL0_0]]) + ; CHECK-DAG: call void @scalar_user(float [[VAL0_2]]) + call void @scalar_user(float %data0_0) + call void @scalar_user(float %data0_2) + + ; CHECK: [[DATA4:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[HANDLE]], i32 4, i32 0, i8 15, i32 4) + %data4 = call <4 x float> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", <4 x float>, 0, 0) %buffer, i32 4, i32 0) + + ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 0 + ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 1 + ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 2 + ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 3 + ; CHECK: insertelement <4 x float> undef + ; CHECK: insertelement <4 x float> + ; CHECK: insertelement <4 x float> + ; CHECK: insertelement <4 x float> + call void @vector_user(<4 x float> %data4) + + ; CHECK: [[DATA12:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[HANDLE]], i32 12, i32 0, i8 15, i32 4) + %data12 = call <4 x float> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", <4 x float>, 0, 0) %buffer, i32 12, i32 0) + + ; CHECK: [[DATA12_3:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA12]], 3 + %data12_3 = extractelement <4 x float> %data12, i32 3 + + ; If there are a mix of users we need the vector, but extracts are direct + ; CHECK: call void @scalar_user(float [[DATA12_3]]) + call void @scalar_user(float %data12_3) + call void @vector_user(<4 x float> %data12) + + ret void +} + +define void @loadv3f32x4() { + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BIND]] + %buffer = call target("dx.RawBuffer", <12 x float>, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_v3f32x4_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; The temporary casts should all have been cleaned up + ; CHECK-NOT: %dx.cast_handle + + ; CHECK: [[DATA0_3:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[HANDLE]], i32 0, i32 0, i8 15, i32 4) + ; CHECK: [[DATA4_7:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[HANDLE]], i32 0, i32 16, i8 15, i32 4) + ; CHECK: [[DATA8_11:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[HANDLE]], i32 0, i32 32, i8 15, i32 4) + %data0 = call <12 x float> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", <12 x float>, 0, 0) %buffer, i32 0, i32 0) + + ; The extract order depends on the users, so don't enforce that here. + ; CHECK-DAG: [[VAL0_2:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0_3]], 2 + %data0_2 = extractelement <12 x float> %data0, i32 2 + ; CHECK-DAG: [[VAL0_7:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA4_7]], 3 + %data0_7 = extractelement <12 x float> %data0, i32 7 + + ; If all of the uses are extracts, we skip creating a vector + ; CHECK-NOT: insertelement + ; CHECK-DAG: call void @scalar_user(float [[VAL0_2]]) + ; CHECK-DAG: call void @scalar_user(float [[VAL0_7]]) + call void @scalar_user(float %data0_2) + call void @scalar_user(float %data0_7) + + ;; Vector Use + ; + ; CHECK: [[DATA3_0_3:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %buffer_annot, i32 3, i32 0, i8 15, i32 4) + ; CHECK: [[DATA3_4_7:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %buffer_annot, i32 3, i32 16, i8 15, i32 4) + ; CHECK: [[DATA3_8_11:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %buffer_annot, i32 3, i32 32, i8 15, i32 4) + ; CHECK: [[VAL3_0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_0_3]], 0 + ; CHECK: [[VAL3_1:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_0_3]], 1 + ; CHECK: [[VAL3_2:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_0_3]], 2 + ; CHECK: [[VAL3_3:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_0_3]], 3 + ; CHECK: [[VAL3_4:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_4_7]], 0 + ; CHECK: [[VAL3_5:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_4_7]], 1 + ; CHECK: [[VAL3_6:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_4_7]], 2 + ; CHECK: [[VAL3_7:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_4_7]], 3 + ; CHECK: [[VAL3_8:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_8_11]], 0 + ; CHECK: [[VAL3_9:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_8_11]], 1 + ; CHECK: [[VAL3_10:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_8_11]], 2 + ; CHECK: [[VAL3_11:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA3_8_11]], 3 + ; CHECK: insertelement <12 x float> undef + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: insertelement <12 x float> + ; CHECK: [[VAL3_VecRes:%.*]] = insertelement <12 x float> + ; CHECK: call void @vector_user_v3f32x4(<12 x float> [[VAL3_VecRes]]) + %data3 = call <12 x float> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", <12 x float>, 0, 0) %buffer, i32 3, i32 0) + call void @vector_user_v3f32x4(<12 x float> %data3); + + ret void +} + +define void @loadv4i32x2() { + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BIND]] + %buffer = call target("dx.RawBuffer", <8 x i32>, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_v4i32x2_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_3:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %buffer_annot, i32 0, i32 0, i8 15, i32 4) + ; CHECK: [[DATA4_7:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %buffer_annot, i32 0, i32 16, i8 15, i32 4) + %data0 = call <8 x i32> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", <8 x i32>, 0, 0) %buffer, i32 0, i32 0) + + ret void +} + +define void @loadv4f16() { + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BIND]] + %buffer = call target("dx.RawBuffer", <4 x half>, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_v4f16_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f16 @dx.op.rawBufferLoad.f16(i32 139, %dx.types.Handle %buffer_annot, i32 0, i32 0, i8 15, i32 2) + %data0 = call <4 x half> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", <4 x half>, 0, 0) %buffer, i32 0, i32 0) + + ret void +} + +define void @loadv2i16x3() { + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BIND]] + %buffer = call target("dx.RawBuffer", <6 x i16>, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_v2i16x3_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_3:%.*]] = call %dx.types.ResRet.i16 @dx.op.rawBufferLoad.i16(i32 139, %dx.types.Handle %buffer_annot, i32 0, i32 0, i8 15, i32 2) + ; CHECK: [[DATA4_5:%.*]] = call %dx.types.ResRet.i16 @dx.op.rawBufferLoad.i16(i32 139, %dx.types.Handle %buffer_annot, i32 0, i32 8, i8 3, i32 2) + %data0 = call <6 x i16> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", <6 x i16>, 0, 0) %buffer, i32 0, i32 0) + + ret void +} + +;; ByteAddressBuffer load +define void @load_2() { + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BIND]] + %buffer = call target("dx.RawBuffer", i8, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_v2i32_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_1:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %buffer_annot, i32 0, i32 0, i8 3, i32 4) + %data0 = call <2 x i32> @llvm.dx.rawBufferLoad( + target("dx.RawBuffer", i8, 0, 0) %buffer, i32 0, i32 0) + + ret void +}