[llvm][opt][Transforms][SPIR-V] Enable InferAddressSpaces for SPIR-V #110897
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-clang

Author: Alex Voicu (AlexVlx)

Changes

Albeit not currently enabled, the InferAddressSpaces pass is desirable / profitable for SPIR-V, as it can leverage information that might subsequently be lost as transforms are applied to the IR / resulting SPIR-V. This patch enables the pass for all SPIR-V targets, and is modelled after the AMDGPU implementation.

Patch is 93.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110897.diff

20 Files Affected:
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index b295bbbdaaf955..15c8b46d278ea1 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -58,13 +58,11 @@
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
-// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
-// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
-// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
-// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1
-// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP1]], align 4
+// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) [[X_COERCE]], align 4
+// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
+// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(1) [[X_COERCE]], align 4
// OPT-SPIRV-NEXT: ret void
//
// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel1Pi(
@@ -126,13 +124,11 @@ __global__ void kernel1(int *x) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
-// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
-// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
-// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
-// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1
-// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP1]], align 4
+// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) [[X_COERCE]], align 4
+// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
+// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(1) [[X_COERCE]], align 4
// OPT-SPIRV-NEXT: ret void
//
// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel2Ri(
@@ -195,7 +191,7 @@ __global__ void kernel2(int &x) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
-// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] {
+// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4
// OPT-SPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4
@@ -261,7 +257,7 @@ __global__ void kernel3(__attribute__((address_space(2))) int *x,
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_func void @_Z4funcPi(
-// OPT-SPIRV-SAME: ptr addrspace(4) nocapture noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2:[0-9]+]] {
+// OPT-SPIRV-SAME: ptr addrspace(4) nocapture noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[X]], align 4
// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
@@ -343,7 +339,7 @@ struct S {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
-// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2:[0-9]+]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 0
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 1
@@ -446,19 +442,17 @@ __global__ void kernel4(struct S s) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
-// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[S_COERCE]] to i64
-// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
-// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[TMP1]], align 8
-// OPT-SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
-// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1
-// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP2]], align 4
-// OPT-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP1]], i64 8
-// OPT-SPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[Y]], align 8
-// OPT-SPIRV-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[TMP4]], align 4
-// OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP5]], 1.000000e+00
-// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[TMP4]], align 4
+// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(1) [[S_COERCE]], align 8
+// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
+// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
+// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP0]], align 4
+// OPT-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[S_COERCE]], i64 8
+// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(1) [[Y]], align 8
+// OPT-SPIRV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[TMP2]], align 4
+// OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00
+// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[TMP2]], align 4
// OPT-SPIRV-NEXT: ret void
//
// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel5P1S(
@@ -551,7 +545,7 @@ struct T {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
-// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0
// OPT-SPIRV-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x ptr addrspace(4)] [[TMP0]], 0
@@ -631,13 +625,11 @@ __global__ void kernel6(struct T t) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
-// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noalias nocapture noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
-// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
-// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
-// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
-// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1
-// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP1]], align 4
+// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) [[X_COERCE]], align 4
+// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
+// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(1) [[X_COERCE]], align 4
// OPT-SPIRV-NEXT: ret void
//
// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel7Pi(
@@ -700,7 +692,7 @@ struct SS {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
-// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_SS]] [[A_COERCE]], 0
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[TMP0]], align 4
diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt
index 326343ae278148..0ae292498e4636 100644
--- a/llvm/lib/Target/SPIRV/CMakeLists.txt
+++ b/llvm/lib/Target/SPIRV/CMakeLists.txt
@@ -52,6 +52,8 @@ add_llvm_target(SPIRVCodeGen
Core
Demangle
GlobalISel
+ Passes
+ Scalar
SPIRVAnalysis
MC
SPIRVDesc
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index e5384b2eb2c2c1..91bcd68813fc55 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -26,9 +26,15 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include <optional>
@@ -91,6 +97,89 @@ SPIRVTargetMachine::SPIRVTargetMachine(const Target &T, const Triple &TT,
setRequiresStructuredCFG(false);
}
+namespace {
+ enum AddressSpace {
+ Function = storageClassToAddressSpace(SPIRV::StorageClass::Function),
+ CrossWorkgroup =
+ storageClassToAddressSpace(SPIRV::StorageClass::CrossWorkgroup),
+ UniformConstant =
+ storageClassToAddressSpace(SPIRV::StorageClass::UniformConstant),
+ Workgroup = storageClassToAddressSpace(SPIRV::StorageClass::Workgroup),
+ Generic = storageClassToAddressSpace(SPIRV::StorageClass::Generic)
+ };
+}
+
+unsigned SPIRVTargetMachine::getAssumedAddrSpace(const Value *V) const {
+ const auto *LD = dyn_cast<LoadInst>(V);
+ if (!LD)
+ return UINT32_MAX;
+
+ // It must be a load from a pointer to Generic.
+ assert(V->getType()->isPointerTy() &&
+ V->getType()->getPointerAddressSpace() == AddressSpace::Generic);
+
+ const auto *Ptr = LD->getPointerOperand();
+ if (Ptr->getType()->getPointerAddressSpace() != AddressSpace::UniformConstant)
+ return UINT32_MAX;
+ // For a loaded from a pointer to UniformConstant, we can infer CrossWorkgroup
+ // storage, as this could only have been legally initialised with a
+ // CrossWorkgroup (aka device) constant pointer.
+ return AddressSpace::CrossWorkgroup;
+}
+
+std::pair<const Value *, unsigned>
+SPIRVTargetMachine::getPredicatedAddrSpace(const Value *V) const {
+ using namespace PatternMatch;
+
+ if (auto *II = dyn_cast<IntrinsicInst>(V)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_is_shared:
+ return std::pair(II->getArgOperand(0), AddressSpace::Workgroup);
+ case Intrinsic::amdgcn_is_private:
+ return std::pair(II->getArgOperand(0), AddressSpace::Function);
+ default:
+ break;
+ }
+ return std::pair(nullptr, UINT32_MAX);
+ }
+ // Check the global pointer predication based on
+ // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
+ // the order of 'is_shared' and 'is_private' is not significant.
+ Value *Ptr;
+ if (getTargetTriple().getVendor() == Triple::VendorType::AMD &&
+ match(
+ const_cast<Value *>(V),
+ m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
+ m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(m_Deferred(Ptr))))))
+ return std::pair(Ptr, AddressSpace::CrossWorkgroup);
+
+ return std::pair(nullptr, UINT32_MAX);
+}
+
+bool SPIRVTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ if (SrcAS != AddressSpace::Generic && SrcAS != AddressSpace::CrossWorkgroup)
+ return false;
+ return DestAS == AddressSpace::Generic ||
+ DestAS == AddressSpace::CrossWorkgroup;
+}
+
+void SPIRVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
+ PB.registerCGSCCOptimizerLateEPCallback([](CGSCCPassManager &PM,
+ OptimizationLevel Level) {
+ if (Level == OptimizationLevel::O0)
+ return;
+
+ FunctionPassManager FPM;
+
+ // Add infer address spaces pass to the opt pipeline after inlining
+ // but before SROA to increase SROA opportunities.
+ FPM.addPass(InferAddressSpacesPass(AddressSpace::Generic));
+
+ PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+ });
+}
+
namespace {
// SPIR-V Code Generator Pass Configuration Options.
class SPIRVPassConfig : public TargetPassConfig {
@@ -178,6 +267,9 @@ void SPIRVPassConfig::addIRPasses() {
addPass(createSPIRVStructurizerPass());
}
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
+ addPass(createInferAddressSpacesPass(AddressSpace::Generic));
+
addPass(createSPIRVRegularizerPass());
addPass(createSPIRVPrepareFunctionsPass(TM));
addPass(createSPIRVStripConvergenceIntrinsicsPass());
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.h b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.h
index a1a9f26846153b..24b09febb9d184 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.h
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.h
@@ -43,6 +43,13 @@ class SPIRVTargetMachine : public LLVMTargetMachine {
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ unsigned getAssumedAddrSpace(const Value *V) const override;
+ std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const override;
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DstAS) const override;
+
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
index 24047f31fab290..295c0ceeade839 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
@@ -39,6 +39,10 @@ class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> {
: BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
+ unsigned getFlatAddressSpace() const {
+ return storageClassToAddressSpace(SPIRV::StorageClass::Generic);
+ }
+
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
// SPIR-V natively supports OpBitcount, per 3.53.14 in the spec, as such it
// is reasonable to assume the Op is fast / preferable to the expanded loop.
diff --git a/llvm/test/Transforms/InferAddressSpaces/SPIRV/assumed-addrspace.ll b/llvm/test/Transforms/InferAddressSpaces/SPIRV/assumed-addrspace.ll
new file mode 100644
index 00000000000000..9b65ff44f288f2
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/SPIRV/assumed-addrspace.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -mtriple=spirv32-- -passes=infer-address-spaces -o - %s | FileCheck %s
+; RUN: opt -S -mtriple=spirv64-- -passes=infer-address-spaces -o - %s | FileCheck %s
+
+@c0 = addrspace(2) global ptr undef
+
+; CHECK-LABEL: @generic_ptr_from_constant
+; CHECK: addrspacecast ptr addrspace(4) %p to ptr addrspace(1)
+; CHECK-NEXT: load float, ptr addrspace(1)
+define spir_func float @generic_ptr_from_constant() {
+ %p = load ptr addrspace(4), ptr addrspace(2) @c0
+ %v = load float, ptr addrspace(4) %p
+ ret float %v
+}
+
+%struct.S = type { ptr addrspace(4), ptr addrspace(4) }
+
+; CHECK-LABEL: @generic_ptr_from_aggregate_argument
+; CHECK: addrspacecast ptr addrspace(4) %p0 to ptr addrspace(1)
+; CHECK: addrspacecast ptr addrspace(4) %p1 to ptr addrspace(1)
+; CHECK: load i32, ptr addrspace(1)
+; CHECK: store float %v1, ptr addrspace(1)
+; CHECK: ret
+define spir_kernel void @generic_ptr_from_aggregate_argument(ptr addrspace(2) byval(%struct.S) align 8 %0) {
+ %p0 = load ptr addrspace(4), ptr addrspace(2) %0
+ %f1 = getelementptr inbounds %struct.S, ptr addrspace(2) %0, i64 0, i32 1
+ %p1 = load ptr addrspace(4), ptr addrspace(2) %f1
+ %v0 = load i32, ptr addrspace(4) %p0
+ %v1 = sitofp i32 %v0 to float
+ store float %v1, ptr addrspace(4) %p1
+ ret void
+}
diff --git a/llvm/test/Transforms/InferAddressSpaces/SPIRV/basic.ll b/llvm/test/Transforms/InferAddressSpaces/SPIRV/basic.ll
new file mode 100644
index 00000000000000..75b23aa30349af
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/SPIRV/basic.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=spirv32-- -passes=infer-address-spaces %s | FileCheck %s
+; RUN: opt -S -mtriple=spirv64-- -passes=infer-address-spaces %s | FileCheck %s
+
+; Trivial optimization of generic addressing
+
+define float @load_global_from_flat(ptr addrspace(4) %generic_scalar) #0 {
+; CHECK-LABEL: define float @load_global_from_flat(
+; CHECK-SAME: ptr addrspace(4) [[GENERIC_SCALAR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(4) [[GENERIC_SCALAR]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[TMP0]], align 4
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %tmp0 = addrspacecast ptr addrspace(4) %generic_scalar to ptr addrspace(1)
+ %tmp1 = load float, ptr addrspace(1) %tmp0
+ ret float %tmp1
+}
+
+define float @load_group_from_flat(ptr addrspace(4) %generic_scalar) #0 {
+; CHECK-LABEL: define float @load_group_from_flat(
+; CHECK-SAME: ptr addrspace(4) [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(4) [[GENERIC_SCALAR]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[TMP0]], align 4
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %tmp0 = addrspacecast ptr addrspace(4) %generic_scalar to ptr addrspace(3)
+ %tmp1 = load float, ptr addrspace(3) %tmp0
+ ret float %tmp1
+}
+
+define float @load_private_from_flat(ptr addrspace(4) %generic_scalar) #0 {
+; CHECK-LABEL: define float @load_private_from_flat(
+; CHECK-SAME: ptr addrspace(4) [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(4) [[GENERIC_SCALAR]] to ptr
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4
+; CHECK-NEXT: ret float [[TMP1]]
+;
+ %tmp0 = addrspacecast ptr addrspace(4) %generic_scalar to ptr
+ %tmp1 = load float, ptr %tmp0
+ ret float %tmp1
+}
+
+define spir_kernel void @store_global_from_flat(ptr addrspace(4) %generic_scalar) #0 {
+; CHECK-LABEL: define spir_kernel void @store_global_from_flat(
+; CHECK-SAME: ptr addrspace(4) [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(4) [[GENERIC_SCALAR]] to ptr addrspace(1)
+; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) [[TMP0]], align 4
+; CHECK-NEXT: ret void
+;
+ %tmp0 = addrspacecast ptr addrspace(4) %generic_scalar to ptr addrspace(1)
+ store float 0.0, ptr addrspace(1) %tmp0
+ ret void
+}
+
+define spir_kernel void @store_group_from_flat(ptr addrspace(4) %generic_scalar) #0 {
+; CHECK-LABEL: define spir_kernel void @store_group_from_flat(
+; CHECK-SAME: ptr addrspace(4) [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(4) [[GENERIC_SCALAR]] to ptr addrspace(3)
+; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(3) [[TMP0]], align 4
+; CHECK-NEXT: ret void
+;
+ %tmp0 = addrspacecast ptr addrspace(4) %generic_scalar to ptr addrspace(3)
+ store float 0.0, ptr addrspace(3) %tmp0
+ ret void
+}
+
+define spir_kernel void @store_private_from_flat(ptr addrspace(4) %generic_scalar) #0 {
+; CHECK-LABEL: de...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
unsigned SPIRVTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return UINT32_MAX;

  // It must be a load from a pointer to Generic.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AddressSpace::Generic);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AddressSpace::UniformConstant)
    return UINT32_MAX;
  // For a loaded from a pointer to UniformConstant, we can infer CrossWorkgroup
  // storage, as this could only have been legally initialised with a
  // CrossWorkgroup (aka device) constant pointer.
  return AddressSpace::CrossWorkgroup;
}

std::pair<const Value *, unsigned>
SPIRVTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  using namespace PatternMatch;

  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AddressSpace::Workgroup);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AddressSpace::Function);
    default:
      break;
    }
    return std::pair(nullptr, UINT32_MAX);
  }
  // Check the global pointer predication based on
  // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (getTargetTriple().getVendor() == Triple::VendorType::AMD &&
      match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AddressSpace::CrossWorkgroup);

  return std::pair(nullptr, UINT32_MAX);
}
This is the fancy stuff that should go into a follow-up patch to add assume support.
      m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
              m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                  m_Deferred(Ptr))))))
This shouldn't be looking at the amdgcn intrinsics, should it? Surely SPIR-V has its own operations for this?
There's AMDGCN flavoured SPIR-V, which'd possibly have these in source; I don't think there are AS predicates in SPIR-V, at least not AFAICS in Clang/LLVM/the spec - happy to add them if they exist, but we'll need both.
If I have skimmed SPIRV correctly, it expects invalid addrspacecasts (OpGenericCastToPtrExplicit) to return null. You could implement the same kind of check by looking for icmp ne (addrspacecast x to y), null
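For illustration, here is a rough sketch of what matching that idiom could look like (a hypothetical helper, not part of this patch; `AddressSpace::Generic` and the `UINT32_MAX` sentinel are the ones from the code quoted above):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include <cstdint>
#include <utility>

using namespace llvm;

// Hypothetical: recognise `icmp ne (addrspacecast %p to AS'), null`, the
// shape a frontend could emit for OpGenericCastToPtrExplicit. On the edge
// where the compare is true, %p is known to be in AS' (an `eq` compare would
// guard the false edge instead; that case is omitted for brevity).
static std::pair<const Value *, unsigned>
getPredicatedAddrSpaceFromNullCheck(const Value *V) {
  const auto *Cmp = dyn_cast<ICmpInst>(V);
  if (!Cmp || Cmp->getPredicate() != ICmpInst::ICMP_NE)
    return {nullptr, UINT32_MAX};
  // Accept the null constant on either side of the compare.
  const Value *Op = Cmp->getOperand(0);
  if (isa<ConstantPointerNull>(Op))
    Op = Cmp->getOperand(1);
  else if (!isa<ConstantPointerNull>(Cmp->getOperand(1)))
    return {nullptr, UINT32_MAX};
  // The other side must be a cast out of the Generic address space.
  const auto *ASC = dyn_cast<AddrSpaceCastOperator>(Op);
  if (!ASC || ASC->getSrcAddressSpace() != AddressSpace::Generic)
    return {nullptr, UINT32_MAX};
  return {ASC->getPointerOperand(), ASC->getDestAddressSpace()};
}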
Neither the BE nor the Translator handle that at the moment, and I suspect it's meant for implementing some specific bit of OpenCL (SYCL?) functionality. We use the non-explicit flavours, and those don't return null (and are diagnosed as illegal if they are illegal per spec). This is probably a good way of implementing the predicates / handling this, so thank you for it. Having said that, I reiterate that we have AMDGCN flavoured SPIR-V where the actual AMDGCN predicates would manifest / make sense.
We could do the same thing for amdgpu. We implement addrspacecast with the same operations.
This also reminds me, we should have a valid flag on addrspacecast.
Oh, just seeing this comment @AlexVlx. I think that we just need to implement the AS predicates (is_local / is_private & friends) atop OpGenericPtrMemSemantics.

Is that for AMDGCN or something more general? If the latter, the spec doesn't offer enough guarantees to do that.
I was thinking about generic (general) predicates for SPIR-V. AFAICS the spec says this about OpGenericPtrMemSemantics: "Result is a valid [Memory Semantics](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Memory_Semantics_-id-) which includes mask bits set for the Storage Class for the specific (non-Generic) Storage Class of Pointer". My interpretation (which could be wrong) is that the bits returned in the mask actually indicate the pointee's AS, so the generic predicates would lower to (handwavium alert) OpGenericPtrMemSemantics + bitwise AND.
> My interpretation (which could be wrong) is that the bits returned in the mask actually indicate the pointee's AS, so the generic predicates would lower to OpGenericPtrMemSemantics + bitwise AND.

The returned value is guaranteed to be a valid combination for the AS, but an impl can use the same combination for different AS.
I’m not quite sure how to parse this, apologies - what is an implementation in this case? It would be rather odd to have a valid implementation use e.g. setting the WorkGroup bit to denote CrossWorkGroup, would it not? Note I’m only considering SPIR-V, not what a target would decide to lower it to.
> what is an implementation in this case?

A tool consuming the SPIR-V module, like an OpenCL driver.

> It would be rather odd to have a valid implementation use e.g. setting the WorkGroup bit to denote CrossWorkGroup, would it not?

It is, but it may not make a difference for all platforms (e.g. CPUs don't typically have a dedicated workgroup memory), and checking what you are dealing with can be somewhat expensive or complex for no clear benefit down the line.
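To make the "OpGenericPtrMemSemantics + bitwise AND" idea from this thread concrete, a rough sketch follows (the mask bit values are the ones the SPIR-V spec assigns to Memory Semantics; whether an implementation reports them precisely enough to distinguish address spaces is exactly the caveat above):

#include <cstdint>

// SPIR-V Memory Semantics mask bits, as assigned by the spec.
constexpr uint32_t WorkgroupMemory = 0x100;
constexpr uint32_t CrossWorkgroupMemory = 0x200;

// Hypothetical lowering: is_local(p) becomes a bitwise AND against the mask
// returned by OpGenericPtrMemSemantics(p).
bool isLocalFromSemantics(uint32_t SemanticsMask) {
  return (SemanticsMask & WorkgroupMemory) != 0;
}

// is_global(p) would test the CrossWorkgroup bit in the same way.
bool isGlobalFromSemantics(uint32_t SemanticsMask) {
  return (SemanticsMask & CrossWorkgroupMemory) != 0;
}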
  Generic = storageClassToAddressSpace(SPIRV::StorageClass::Generic)
};

unsigned SPIRVTargetMachine::getAssumedAddrSpace(const Value *V) const {
Move to separate change, not sure this is necessarily valid for spirv
UniformConstant is pretty much OCL constant (with a bit of handwavium around initializers being allowed depending on an undefined client API). This is just saying that if you have a load from that, and you're loading a pointer, that pointer can only point to global (CrossWorkgroup), which I think holds here as well because there's no legal way to put a private or a local (shared) pointer in there (if you do it at static init, before a kernel executes, you cannot form those types of addresses; if you do it as the kernel executes, it's UB). Or are you worried about cases where global does not include constant?
I think the routine is ok for a vanilla OpenCL environment but extensions may make it invalid.
bool SPIRVTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                             unsigned DestAS) const {
  if (SrcAS != AddressSpace::Generic && SrcAS != AddressSpace::CrossWorkgroup)
    return false;
  return DestAS == AddressSpace::Generic ||
         DestAS == AddressSpace::CrossWorkgroup;
}
This is separate, I don't think InferAddressSpaces relies on this
It does; please see isNoopPtrIntCastPair in its implementation.
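For readers following along, a condensed sketch of that check (the real isNoopPtrIntCastPair in InferAddressSpaces.cpp goes through TTI, validates integer and pointer widths against the DataLayout, and also handles constant expressions; the version below only shows the core idea):

#include "llvm/IR/Instructions.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

// Condensed sketch: a ptrtoint/inttoptr round-trip is a disguised
// addrspacecast, and InferAddressSpaces may look through and rewrite it only
// when the target reports the underlying cast as a no-op, which is what the
// isNoopAddrSpaceCast hook above feeds into.
static bool looksLikeNoopPtrIntCastPair(const Value *V,
                                        const TargetMachine &TM) {
  const auto *I2P = dyn_cast<IntToPtrInst>(V);
  if (!I2P)
    return false;
  const auto *P2I = dyn_cast<PtrToIntInst>(I2P->getOperand(0));
  if (!P2I)
    return false;
  return TM.isNoopAddrSpaceCast(P2I->getSrcTy()->getPointerAddressSpace(),
                                I2P->getDestTy()->getPointerAddressSpace());
}

This is the pattern visible in the clang test diffs above, where the ptrtoint/inttoptr pairs around the kernel arguments disappear once the hook is in place.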
@@ -178,6 +266,9 @@ void SPIRVPassConfig::addIRPasses() {
     addPass(createSPIRVStructurizerPass());
   }
 
+  if (TM.getOptLevel() > CodeGenOptLevel::None)
+    addPass(createInferAddressSpacesPass(AddressSpace::Generic));
Not sure why this is a pass parameter to InferAddressSpaces, and a TTI hook
Because if one invokes the pass directly via opt there's no way but the TTI query to set Flat/Generic to anything but 0, and because making it explicit at the point of construction, rather than relying on that, seems somewhat more self-documenting.
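A minimal illustration of the two routes (constructors as declared in llvm/Transforms/Scalar/InferAddressSpaces.h; the surrounding pipeline code is assumed):

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"

using namespace llvm;

void addInferAddrSpaces(FunctionPassManager &FPM, unsigned GenericAS) {
  // Explicit flat address space at construction, as this patch does: the
  // choice is visible at the point of use.
  FPM.addPass(InferAddressSpacesPass(GenericAS));

  // Alternatively, default-constructed, the pass falls back to
  // TTI::getFlatAddressSpace(), which is the only route when it runs as a
  // bare `opt -passes=infer-address-spaces`:
  // FPM.addPass(InferAddressSpacesPass());
}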
Out of curiosity, why do you invoke this pass twice, in the middle-end and in codegen?
I remember we had some phase ordering issues where we needed to run this multiple times. I'm not sure what the current status is. We certainly need to run this after inlining
Should restrict this to just adding the basic pass, without the fancy assumed address space or assume handling. Leave those for later.

Also don't duplicate every test. These are mostly structural tests for the pass that should not be duplicated in every target. Just add a simple test with the basics to show the pass runs.

Any particular reason for this, aside from the concern around constant / UniformConstant? I'll re-iterate that for AMDGCN flavoured SPIR-V it should do exactly what we do in AMDGPU, so punting in general seems counter-intuitive.
> Move to separate change, not sure this is necessarily valid for spirv

I think that I'd prefer to keep this around, definitely for AMDGCNSPIRV where we know it is both correct and empirically beneficial. For vanilla SPIR-V I'll defer to folks on that side - I cannot think of cases where it'd be legal to put anything but a pointer to global (CrossWorkgroup) in constant (UniformConstant), but that might simply be ignorance on my part.
}

std::pair<const Value *, unsigned>
SPIRVTargetMachine::getPredicatedAddrSpace(const Value *V) const {
Drop this part from the patch, it's not tested and is questionable enough to do separately
Done.
Co-authored-by: Victor Lomuller <[email protected]>
  // TODO: we only enable this for AMDGCN flavoured SPIR-V, where we know it to
  // be correct; this might be relaxed in the future.
  if (getTargetTriple().getVendor() != Triple::VendorType::AMD)
    return UINT32_MAX;
Can we add AddressSpace::Unknown / AddressSpace::Invalid instead of using a magic constant, or at least document the meaning of UINT32_MAX?
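Presumably something along these lines (a hypothetical sketch; the enum values shown are illustrative stand-ins for the storageClassToAddressSpace results in the patch):

#include <cstdint>

// Hypothetical: name the sentinel so call sites can return
// AddressSpace::Unknown instead of a bare UINT32_MAX.
enum AddressSpace : uint32_t {
  Function = 0,       // illustrative values; the real ones come from
  CrossWorkgroup = 1, // storageClassToAddressSpace(...), as in the patch
  UniformConstant = 2,
  Workgroup = 3,
  Generic = 4,
  Unknown = UINT32_MAX // no address space could be inferred
};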
unsigned SPIRVTargetMachine::getAssumedAddrSpace(const Value *V) const {
  // TODO: we only enable this for AMDGCN flavoured SPIR-V, where we know it to
  // be correct; this might be relaxed in the future.
  if (getTargetTriple().getVendor() != Triple::VendorType::AMD)
Please move this whole hook to a separate PR. I also do not think we should have any vendor checks
bool SPIRVTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                             unsigned DestAS) const {
  if (getTargetTriple().getVendor() != Triple::VendorType::AMD)
Should not have a vendor check
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' ba7e27381f1ce56b46839dca89e5d56ea170714e 2bc152a42dd43577db9bfdf96551f6f6166b83c8 llvm/test/Transforms/InferAddressSpaces/SPIRV/assumed-addrspace.ll llvm/test/Transforms/InferAddressSpaces/SPIRV/basic.ll llvm/test/Transforms/InferAddressSpaces/SPIRV/infer-address-space.ll llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp llvm/lib/Target/SPIRV/SPIRVTargetMachine.h llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef. For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.