diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll new file mode 100644 index 0000000000000..5c8fcfdca2385 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll @@ -0,0 +1,133 @@ +; This test emulates two translation units with 3 kernels: +; TU0_kernel0 - 1st translation unit, no aspects used +; TU0_kernel1 - 1st translation unit, aspect 1 is used +; TU1_kernel2 - 2nd translation unit, no aspects used + +; The test is intended to check that sycl-post-link correctly separates kernels +; that use aspects from kernels which don't use aspects regardless of device +; code split mode + +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s 
-input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; Regardless of device code split mode, each kernel should go into a separate +; device image, i.e. no image may mention either of the other two kernels + +; CHECK-M2-IR: define {{.*}} @TU0_kernel0 +; CHECK-M2-SYMS: TU0_kernel0 + +; CHECK-M1-IR: define {{.*}} @TU0_kernel1 +; CHECK-M1-SYMS: TU0_kernel1 + +; CHECK-M0-IR: define {{.*}} @TU1_kernel2 +; CHECK-M0-SYMS: TU1_kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +; FIXME: device globals should also be properly distributed across device images +; 
if they are of optional type +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @foo() + ret void +} + +define dso_local spir_func void @foo() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @bar(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +define dso_local spir_kernel void @TU0_kernel1() #0 !sycl_used_aspects !2 { +entry: + call spir_func void @foo1() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo1() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} + +define dso_local spir_kernel void @TU1_kernel2() #1 { +entry: + call spir_func void @foo2() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo2() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 1} diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll new file mode 100644 index 0000000000000..ff9aa0f29376e --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll @@ -0,0 +1,59 @@ +; The test is intended to check that 
sycl-post-link correctly groups kernels +; by unique sets of aspects used in them + +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel3 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 + +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.sym +; CHECK-TABLE-NEXT: _1.sym +; CHECK-TABLE-NEXT: _2.sym +; CHECK-TABLE-EMPTY: + +; CHECK-M0-SYMS: kernel3 + +; CHECK-M1-SYMS: kernel0 + +; CHECK-M2-SYMS: kernel1 +; CHECK-M2-SYMS: kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define dso_local spir_kernel void @kernel0() #0 !sycl_used_aspects !1 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel1() #0 !sycl_used_aspects !2 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel2() #0 !sycl_used_aspects !3 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel3() #0 !sycl_used_aspects !4 { +entry: + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +!1 = !{i32 1} +!2 = !{i32 1, i32 2} +!3 = !{i32 2, i32 1} +!4 = !{i32 2, i32 3, i32 4} diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll new file mode 100644 index 0000000000000..5fa587abca234 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll @@ -0,0 +1,94 @@ +; This test is intended to check that 
per-aspect device code split works as +; expected with SYCL_EXTERNAL functions + +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not foo --implicit-check-not kernel1 +; +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not foo --implicit-check-not kernel0 +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not foo \ +; RUN: --implicit-check-not bar +; +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not bar + +; We expect to see 3 modules generated: +; +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.sym +; CHECK-TABLE-NEXT: _1.sym +; CHECK-TABLE-NEXT: _2.sym +; CHECK-TABLE-EMPTY: + +; sycl-post-link aims to achieve two goals while doing splitting: +; - each kernel must be self-contained, i.e. all functions called from a +; kernel must reside in the same device image +; - each entry point should be assigned to a correct device image in +; accordance with the selected device code split mode +; +; In this test @bar and @foo are SYCL_EXTERNAL functions and they are treated +; as entry points. +; +; @bar uses the same list of aspects as @kernel0 which calls it and therefore +; they can be put into the same device image. @baz goes there as well, because +; it uses the same list of aspects. +; +; CHECK-M0-SYMS: bar +; CHECK-M0-SYMS: baz +; CHECK-M0-SYMS: kernel0 +; +; The list of aspects used by @foo differs from the one attached to @kernel1 +; which calls @foo (for example, @kernel1 uses an extra optional feature besides +; ones used in @foo). 
As a result, @foo should both be included into the same +; device image as @kernel1 to make it self-contained, but at the same time it +; should also be present in a separate device image, because it is an entry +; point with a unique set of used aspects. +; +; CHECK-M1-SYMS: foo +; +; CHECK-M2-SYMS: kernel1 +; +; @kernel1 uses @foo and therefore @foo should be present in the same module as +; @kernel1 as well +; CHECK-M2-IR-DAG: define spir_func void @foo +; CHECK-M2-IR-DAG: define spir_kernel void @kernel1 + + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define spir_func void @foo() #0 !sycl_used_aspects !1 { + ret void +} + +define spir_func void @bar() #1 !sycl_used_aspects !2 { + ret void +} + +define spir_func void @baz() #1 !sycl_used_aspects !2 { + ret void +} + +define spir_kernel void @kernel0() #1 !sycl_used_aspects !2 { +entry: + call void @bar() + ret void +} + +define spir_kernel void @kernel1() #0 !sycl_used_aspects !3 { +entry: + call void @foo() + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!1 = !{i32 1} +!2 = !{i32 2} +!3 = !{i32 3, i32 1} diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-4.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-4.ll new file mode 100644 index 0000000000000..96972687a1e7f --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-4.ll @@ -0,0 +1,55 @@ +; This test is intended to check that we do not perform per-aspect split if +; it was disabled through one or another sycl-post-link option + +; RUN: sycl-post-link -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK-IR +; +; -lower-esimd is needed so sycl-post-link does not complain about no actions +; specified +; RUN: 
sycl-post-link -lower-esimd -ir-output-only -S %s -o %t.ll +; RUN: FileCheck %s -input-file=%t.ll --check-prefix CHECK-IR + +; We expect to see only one module generated: +; +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.ll +; CHECK-TABLE-EMPTY: + +; Regardless of used aspects and sycl-module-id metadata, all kernels and +; functions should still be present. + +; CHECK-IR-DAG: define spir_func void @foo +; CHECK-IR-DAG: define spir_func void @bar +; CHECK-IR-DAG: define spir_kernel void @kernel0 +; CHECK-IR-DAG: define spir_kernel void @kernel1 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define spir_func void @foo() #0 !sycl_used_aspects !1 { + ret void +} + +define spir_func void @bar() #1 !sycl_used_aspects !2 { + ret void +} + +define spir_kernel void @kernel0() #1 !sycl_used_aspects !2 { +entry: + ret void +} + +define spir_kernel void @kernel1() #0 !sycl_used_aspects !3 { +entry: + call void @foo() + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!1 = !{i32 1} +!2 = !{i32 2} +!3 = !{i32 3, i32 1} + diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp index ada6c5007ed7d..1ce47af2f0b74 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp @@ -761,5 +761,143 @@ getLargeGRFSplitter(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints) { return std::make_unique(std::move(MD), std::move(Groups)); } +namespace { +// Data structure which represents a combination of all possible optional +// features used in a function. +// +// It has extra methods to be usable as a key in llvm::DenseMap. 
+struct UsedOptionalFeatures {
+  // Aspect IDs read from !sycl_used_aspects, sorted so that the order in which
+  // they are listed in the metadata does not affect the key.
+  SmallVector<int, 4> Aspects;
+  // TODO: extend this further with reqd-sub-group-size, reqd-work-group-size,
+  // large-grf and other properties
+
+  UsedOptionalFeatures() = default;
+
+  // Collects the aspects attached to \p F and caches a hash of them.
+  explicit UsedOptionalFeatures(const Function *F) {
+    if (const MDNode *MDN = F->getMetadata("sycl_used_aspects")) {
+      auto ExtractIntegerFromMDNodeOperand = [](const MDOperand &N) {
+        Constant *C = cast<ConstantAsMetadata>(N.get())->getValue();
+        return static_cast<int>(C->getUniqueInteger().getSExtValue());
+      };
+
+      // !sycl_used_aspects is supposed to contain unique values, no duplicates
+      // are expected here
+      llvm::transform(MDN->operands(), std::back_inserter(Aspects),
+                      ExtractIntegerFromMDNodeOperand);
+      llvm::sort(Aspects);
+    }
+
+    llvm::hash_code AspectsHash =
+        llvm::hash_combine_range(Aspects.begin(), Aspects.end());
+    Hash = static_cast<unsigned>(llvm::hash_combine(AspectsHash));
+  }
+
+  // Returns \p BaseName suffixed with "-no-aspects", or with "-aspects"
+  // followed by "-<id>" for every aspect.
+  std::string getName(StringRef BaseName) const {
+    if (Aspects.empty())
+      return BaseName.str() + "-no-aspects";
+
+    std::string Ret = BaseName.str() + "-aspects";
+    for (int A : Aspects)
+      Ret += "-" + std::to_string(A);
+    return Ret;
+  }
+
+  // Sentinel key that marks an erased DenseMap bucket.
+  static UsedOptionalFeatures getTombstone() {
+    UsedOptionalFeatures Ret;
+    Ret.IsTombstoneKey = true;
+    return Ret;
+  }
+
+  // Sentinel key that marks a never-used DenseMap bucket.
+  static UsedOptionalFeatures getEmpty() {
+    UsedOptionalFeatures Ret;
+    Ret.IsEmpty = true;
+    return Ret;
+  }
+
+private:
+  // For DenseMap: hash cached by the constructor and flags identifying the
+  // sentinel keys.
+  llvm::hash_code Hash = {};
+  bool IsTombstoneKey = false;
+  bool IsEmpty = false;
+
+public:
+  bool operator==(const UsedOptionalFeatures &Other) const {
+    // DenseMap recognizes empty and tombstone buckets by comparing the stored
+    // key against the corresponding sentinel, so every sentinel has to compare
+    // equal to itself and unequal to everything else.
+    return IsTombstoneKey == Other.IsTombstoneKey &&
+           IsEmpty == Other.IsEmpty && Aspects == Other.Aspects;
+  }
+
+  // Hash value for DenseMap, computed once in the constructor.
+  unsigned hash() const { return static_cast<unsigned>(Hash); }
+};
+
+struct UsedOptionalFeaturesAsKeyInfo {
+  static inline 
UsedOptionalFeatures getEmptyKey() { + return UsedOptionalFeatures::getEmpty(); + } + + static inline UsedOptionalFeatures getTombstoneKey() { + return UsedOptionalFeatures::getTombstone(); + } + + static unsigned getHashValue(const UsedOptionalFeatures &Value) { + return Value.hash(); + } + + static bool isEqual(const UsedOptionalFeatures &LHS, + const UsedOptionalFeatures &RHS) { + return LHS == RHS; + } +}; +} // namespace + +std::unique_ptr +getSplitterByOptionalFeatures(ModuleDesc &&MD, + bool EmitOnlyKernelsAsEntryPoints) { + EntryPointGroupVec Groups; + + DenseMap + PropertiesToFunctionsMap; + + Module &M = MD.getModule(); + + // Only process module entry points: + for (auto &F : M.functions()) { + if (!isEntryPoint(F, EmitOnlyKernelsAsEntryPoints) || + !MD.isEntryPointCandidate(F)) { + continue; + } + + auto Key = UsedOptionalFeatures(&F); + PropertiesToFunctionsMap[std::move(Key)].insert(&F); + } + + if (PropertiesToFunctionsMap.empty()) { + // No entry points found, record a single empty group. 
+ Groups.emplace_back(GLOBAL_SCOPE_NAME, EntryPointSet{}); + } else { + Groups.reserve(PropertiesToFunctionsMap.size()); + for (auto &EPG : PropertiesToFunctionsMap) { + Groups.emplace_back(EPG.first.getName(MD.getEntryPointGroup().GroupId), + std::move(EPG.second), MD.getEntryPointGroup().Props); + } + } + + if (Groups.size() > 1) + return std::make_unique(std::move(MD), std::move(Groups)); + else + return std::make_unique(std::move(MD), std::move(Groups)); +} + } // namespace module_split } // namespace llvm diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.h b/llvm/tools/sycl-post-link/ModuleSplitter.h index 7088909c8400a..31a70d99c9dc5 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.h +++ b/llvm/tools/sycl-post-link/ModuleSplitter.h @@ -253,6 +253,10 @@ getSplitterByMode(ModuleDesc &&MD, IRSplitMode Mode, std::unique_ptr getLargeGRFSplitter(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints); +std::unique_ptr +getSplitterByOptionalFeatures(ModuleDesc &&MD, + bool EmitOnlyKernelsAsEntryPoints); + #ifndef NDEBUG void dumpEntryPoints(const EntryPointSet &C, const char *msg = "", int Tab = 0); void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false, diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 30ad2da61330c..eb65da70c7d1c 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -756,12 +756,43 @@ processInputModule(std::unique_ptr M) { module_split::getSplitterByMode(module_split::ModuleDesc{std::move(M)}, SplitMode, IROutputOnly, EmitOnlyKernelsAsEntryPoints); - const bool SplitByScope = ScopedSplitter->totalSplits() > 1; - Modified |= SplitByScope; + SmallVector TopLevelModules; + bool SplitByOptionalFeatures = false; + + // FIXME: this check should be performed on all split levels if (DeviceGlobals) ScopedSplitter->verifyNoCrossModuleDeviceGlobalUsage(); + while (ScopedSplitter->hasMoreSplits()) { + 
module_split::ModuleDesc MD = ScopedSplitter->nextSplit(); + + if (IROutputOnly || SplitMode == module_split::SPLIT_NONE) { + // Per-aspect split is disabled in this mode, keep the module as is. + TopLevelModules.emplace_back(std::move(MD)); + continue; + } + + std::unique_ptr OptionalFeaturesSplitter = + module_split::getSplitterByOptionalFeatures( + std::move(MD), EmitOnlyKernelsAsEntryPoints); + + // Here we perform second-level splitting based on device-specific + // features used/declared in entry points. + // This step is mandatory, because it is required for functional + // correctness, i.e. to prevent speculative compilation of kernels that use + // optional features on HW which doesn't support them. + while (OptionalFeaturesSplitter->hasMoreSplits()) { + TopLevelModules.emplace_back(OptionalFeaturesSplitter->nextSplit()); + } + + SplitByOptionalFeatures |= OptionalFeaturesSplitter->totalSplits() > 1; + } + + const bool SplitByScope = ScopedSplitter->totalSplits() > 1; + Modified |= SplitByScope; + Modified |= SplitByOptionalFeatures; + // TODO this nested splitting scheme will not scale well when other split // "dimensions" will be added. Some infra/"split manager" needs to be // implemented in this case - e.g. all needed splitters are registered, then @@ -769,11 +800,12 @@ processInputModule(std::unique_ptr M) { // "leaf" ModuleDesc's resulted from splitting. Some bookkeeping is needed for // ESIMD splitter to link back needed modules. - // Proceed with top-level splitting. - while (ScopedSplitter->hasMoreSplits()) { - module_split::ModuleDesc MDesc = ScopedSplitter->nextSplit(); + // Based on results from the top-level splitting, we perform some lower-level + // splitting for various unique features. 
+ for (module_split::ModuleDesc &MDesc : TopLevelModules) { DUMP_ENTRY_POINTS(MDesc.entries(), MDesc.Name.c_str(), 1); + // FIXME: large grf should be handled by properties splitter above std::unique_ptr LargeGRFSplitter = module_split::getLargeGRFSplitter(std::move(MDesc), EmitOnlyKernelsAsEntryPoints); @@ -848,7 +880,9 @@ processInputModule(std::unique_ptr M) { DUMP_ENTRY_POINTS(MMs.back().entries(), MMs.back().Name.c_str(), 3); Modified = true; } - bool SplitOccurred = SplitByScope || SplitByLargeGRF || SplitByESIMD; + + bool SplitOccurred = SplitByScope || SplitByLargeGRF || SplitByESIMD || + SplitByOptionalFeatures; if (IROutputOnly) { if (SplitOccurred) {