diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll
new file mode 100644
index 0000000000000..5c8fcfdca2385
--- /dev/null
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll
@@ -0,0 +1,133 @@
+; This test emulates two translation units with 3 kernels:
+; TU0_kernel0 - 1st translation unit, no aspects used
+; TU0_kernel1 - 1st translation unit, aspect 1 is used
+; TU1_kernel2 - 2nd translation unit, no aspects used
+
+; The test is intended to check that sycl-post-link correctly separates kernels
+; that use aspects from kernels which don't use aspects regardless of device
+; code split mode
+
+; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
+; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+
+; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
+; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+
+; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
+; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+
+; Regardless of device code split mode, each kernel should go into a separate
+; device image
+
+; CHECK-M2-IR: define {{.*}} @TU0_kernel0
+; CHECK-M2-SYMS: TU0_kernel0
+
+; CHECK-M1-IR: define {{.*}} @TU0_kernel1
+; CHECK-M1-SYMS: TU0_kernel1
+
+; CHECK-M0-IR: define {{.*}} @TU1_kernel2
+; CHECK-M0-SYMS: TU1_kernel2
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-linux"
+
+; FIXME: device globals should also be properly distributed across device images
+; if they are of optional type
+@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
+
+define dso_local spir_kernel void @TU0_kernel0() #0 {
+entry:
+  call spir_func void @foo()
+  ret void
+}
+
+define dso_local spir_func void @foo() {
+entry:
+  %a = alloca i32, align 4
+  %call = call spir_func i32 @bar(i32 1)
+  %add = add nsw i32 2, %call
+  store i32 %add, i32* %a, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) {
+entry:
+  %arg.addr = alloca i32, align 4
+  store i32 %arg, i32* %arg.addr, align 4
+  %0 = load i32, i32* %arg.addr, align 4
+  ret i32 %0
+}
+
+define dso_local spir_kernel void @TU0_kernel1() #0 !sycl_used_aspects !2 {
+entry:
+  call spir_func void @foo1()
+  ret void
+}
+
+; Function Attrs: nounwind
+define dso_local spir_func void @foo1() {
+entry:
+  %a = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  ret void
+}
+
+define dso_local spir_kernel void @TU1_kernel2() #1 {
+entry:
+  call spir_func void @foo2()
+  ret void
+}
+
+; Function Attrs: nounwind
+define dso_local spir_func void @foo2() {
+entry:
+  %a = alloca i32, align 4
+  %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
+  %add = add nsw i32 4, %0
+  store i32 %add, i32* %a, align 4
+  ret void
+}
+
+attributes #0 = { "sycl-module-id"="TU1.cpp" }
+attributes #1 = { "sycl-module-id"="TU2.cpp" }
+
+!opencl.spir.version = !{!0, !0}
+!spirv.Source = !{!1, !1}
+
+!0 = !{i32 1, i32 2}
+!1 = !{i32 4, i32 100000}
+!2 = !{i32 1}
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll
new file mode 100644
index 0000000000000..ff9aa0f29376e
--- /dev/null
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll
@@ -0,0 +1,59 @@
+; The test is intended to check that sycl-post-link correctly groups kernels
+; by unique sets of aspects used in them
+
+; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
+; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE
+;
+; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel1 \
+; RUN:     --implicit-check-not kernel2
+;
+; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \
+; RUN:     --implicit-check-not kernel3 --implicit-check-not kernel1 \
+; RUN:     --implicit-check-not kernel2
+;
+; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel3
+
+; CHECK-TABLE: Code
+; CHECK-TABLE-NEXT: _0.sym
+; CHECK-TABLE-NEXT: _1.sym
+; CHECK-TABLE-NEXT: _2.sym
+; CHECK-TABLE-EMPTY:
+
+; CHECK-M0-SYMS: kernel3
+
+; CHECK-M1-SYMS: kernel0
+
+; CHECK-M2-SYMS: kernel1
+; CHECK-M2-SYMS: kernel2
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-linux"
+
+define dso_local spir_kernel void @kernel0() #0 !sycl_used_aspects !1 {
+entry:
+  ret void
+}
+
+define dso_local spir_kernel void @kernel1() #0 !sycl_used_aspects !2 {
+entry:
+  ret void
+}
+
+define dso_local spir_kernel void @kernel2() #0 !sycl_used_aspects !3 {
+entry:
+  ret void
+}
+
+define dso_local spir_kernel void @kernel3() #0 !sycl_used_aspects !4 {
+entry:
+  ret void
+}
+
+attributes #0 = { "sycl-module-id"="TU1.cpp" }
+
+!1 = !{i32 1}
+!2 = !{i32 1, i32 2}
+!3 = !{i32 2, i32 1}
+!4 = !{i32 2, i32 3, i32 4}
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll
new file mode 100644
index 0000000000000..5fa587abca234
--- /dev/null
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll
@@ -0,0 +1,94 @@
+; This test is intended to check that per-aspect device code split works as
+; expected with SYCL_EXTERNAL functions
+
+; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
+; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE
+;
+; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \
+; RUN:     --implicit-check-not foo --implicit-check-not kernel1
+;
+; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \
+; RUN:     --implicit-check-not foo --implicit-check-not kernel0
+;
+; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not foo \
+; RUN:     --implicit-check-not bar
+;
+; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK-M2-IR \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not bar
+
+; We expect to see 3 modules generated:
+;
+; CHECK-TABLE: Code
+; CHECK-TABLE-NEXT: _0.sym
+; CHECK-TABLE-NEXT: _1.sym
+; CHECK-TABLE-NEXT: _2.sym
+; CHECK-TABLE-EMPTY:
+
+; sycl-post-link aims to achieve two goals while doing splitting:
+;   - each kernel must be self-contained, i.e. all functions called from a
+;     kernel must reside in the same device image
+;   - each entry point should be assigned to a correct device image in
+;     accordance with selected device code split mode
+;
+; In this test @bar and @foo are SYCL_EXTERNAL functions and they are treated
+; as entry points.
+;
+; @bar uses the same list of aspects as @kernel0 which calls it and therefore
+; they can be put into the same device image. @baz also goes there, because it
+; uses the same list of aspects.
+;
+; CHECK-M0-SYMS: bar
+; CHECK-M0-SYMS: baz
+; CHECK-M0-SYMS: kernel0
+;
+; List of aspects used by @foo is different from the one attached to @kernel1
+; which calls @foo (for example, @kernel1 uses an extra optional feature besides
+; ones used in @foo). As a result, @foo should both be included into the same
+; device image as @kernel1 to make it self-contained, and at the same time
+; also be present in a separate device image, because it is an entry point
+; with a unique set of used aspects.
+;
+; CHECK-M1-SYMS: foo
+;
+; CHECK-M2-SYMS: kernel1
+;
+; @kernel1 uses @foo and therefore @foo should be present in the same module as
+; @kernel1 as well
+; CHECK-M2-IR-DAG: define spir_func void @foo
+; CHECK-M2-IR-DAG: define spir_kernel void @kernel1
+
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-linux"
+
+define spir_func void @foo() #0 !sycl_used_aspects !1 {
+  ret void
+}
+
+define spir_func void @bar() #1 !sycl_used_aspects !2 {
+  ret void
+}
+
+define spir_func void @baz() #1 !sycl_used_aspects !2 {
+  ret void
+}
+
+define spir_kernel void @kernel0() #1 !sycl_used_aspects !2 {
+entry:
+  call void @bar()
+  ret void
+}
+
+define spir_kernel void @kernel1() #0 !sycl_used_aspects !3 {
+entry:
+  call void @foo()
+  ret void
+}
+
+attributes #0 = { "sycl-module-id"="TU1.cpp" }
+attributes #1 = { "sycl-module-id"="TU2.cpp" }
+
+!1 = !{i32 1}
+!2 = !{i32 2}
+!3 = !{i32 3, i32 1}
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-4.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-4.ll
new file mode 100644
index 0000000000000..96972687a1e7f
--- /dev/null
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-4.ll
@@ -0,0 +1,55 @@
+; This test is intended to check that we do not perform per-aspect split if
+; it was disabled through one or another sycl-post-link option
+
+; RUN: sycl-post-link -symbols -S %s -o %t.table
+; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE
+; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK-IR
+;
+; -lower-esimd is needed so sycl-post-link does not complain about no actions
+; specified
+; RUN: sycl-post-link -lower-esimd -ir-output-only -S %s -o %t.ll
+; RUN: FileCheck %s -input-file=%t.ll --check-prefix CHECK-IR
+
+; We expect to see only one module generated:
+;
+; CHECK-TABLE: Code
+; CHECK-TABLE-NEXT: _0.ll
+; CHECK-TABLE-EMPTY:
+
+; Regardless of used aspects and sycl-module-id metadata, all kernels and
+; functions should still be present.
+
+; CHECK-IR-DAG: define spir_func void @foo
+; CHECK-IR-DAG: define spir_func void @bar
+; CHECK-IR-DAG: define spir_kernel void @kernel0
+; CHECK-IR-DAG: define spir_kernel void @kernel1
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-linux"
+
+define spir_func void @foo() #0 !sycl_used_aspects !1 {
+  ret void
+}
+
+define spir_func void @bar() #1 !sycl_used_aspects !2 {
+  ret void
+}
+
+define spir_kernel void @kernel0() #1 !sycl_used_aspects !2 {
+entry:
+  ret void
+}
+
+define spir_kernel void @kernel1() #0 !sycl_used_aspects !3 {
+entry:
+  call void @foo()
+  ret void
+}
+
+attributes #0 = { "sycl-module-id"="TU1.cpp" }
+attributes #1 = { "sycl-module-id"="TU2.cpp" }
+
+!1 = !{i32 1}
+!2 = !{i32 2}
+!3 = !{i32 3, i32 1}
+
diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp
index ada6c5007ed7d..1ce47af2f0b74 100644
--- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp
+++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp
@@ -761,5 +761,143 @@ getLargeGRFSplitter(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints) {
     return std::make_unique<ModuleCopier>(std::move(MD), std::move(Groups));
 }
 
+namespace {
+// Data structure, which represents a combination of all possible optional
+// features used in a function.
+//
+// It has extra methods to be usable as a key in llvm::DenseMap.
+struct UsedOptionalFeatures {
+  // Aspect IDs from !sycl_used_aspects, kept sorted so that operand order in
+  // the metadata does not affect key identity.
+  SmallVector<int, 4> Aspects;
+  // TODO: extend this further with reqd-sub-group-size, reqd-work-group-size,
+  // large-grf and other properties
+
+  UsedOptionalFeatures() = default;
+
+  // Builds the key for \p F from its !sycl_used_aspects metadata; a function
+  // without that metadata gets an empty aspect list.
+  explicit UsedOptionalFeatures(const Function *F) {
+    if (const MDNode *MDN = F->getMetadata("sycl_used_aspects")) {
+      auto ExtractIntegerFromMDNodeOperand = [](const MDOperand &N) {
+        Constant *C = cast<ConstantAsMetadata>(N.get())->getValue();
+        return C->getUniqueInteger().getSExtValue();
+      };
+
+      // !sycl_used_aspects is supposed to contain unique values, no duplicates
+      // are expected here
+      llvm::transform(MDN->operands(), std::back_inserter(Aspects),
+                      ExtractIntegerFromMDNodeOperand);
+      llvm::sort(Aspects);
+    }
+
+    llvm::hash_code AspectsHash =
+        llvm::hash_combine_range(Aspects.begin(), Aspects.end());
+    Hash = static_cast<unsigned>(llvm::hash_combine(AspectsHash));
+  }
+
+  // Returns \p BaseName suffixed with "-no-aspects" or "-aspects-<id>-...".
+  std::string getName(StringRef BaseName) const {
+    if (Aspects.empty())
+      return BaseName.str() + "-no-aspects";
+
+    std::string Ret = BaseName.str() + "-aspects";
+    for (int A : Aspects) {
+      Ret += "-" + std::to_string(A);
+    }
+    return Ret;
+  }
+
+  static UsedOptionalFeatures getTombstone() {
+    UsedOptionalFeatures Ret;
+    Ret.IsTombstoneKey = true;
+    return Ret;
+  }
+
+  static UsedOptionalFeatures getEmpty() {
+    UsedOptionalFeatures Ret;
+    Ret.IsEmpty = true;
+    return Ret;
+  }
+
+private:
+  // For DenseMap:
+  llvm::hash_code Hash = {};
+  bool IsTombstoneKey = false;
+  bool IsEmpty = false;
+
+public:
+  bool operator==(const UsedOptionalFeatures &Other) const {
+    // DenseMap recognizes its empty and tombstone buckets by calling isEqual()
+    // against the sentinel keys, so each sentinel must compare equal to itself
+    // and to nothing else. A tombstone that never equals itself would make
+    // erased buckets look like live entries.
+    if (IsTombstoneKey != Other.IsTombstoneKey || IsEmpty != Other.IsEmpty)
+      return false;
+
+    return Aspects == Other.Aspects;
+  }
+
+  unsigned hash() const { return static_cast<unsigned>(Hash); }
+};
+
+// Key traits that let UsedOptionalFeatures be used as a llvm::DenseMap key.
+struct UsedOptionalFeaturesAsKeyInfo {
+  static UsedOptionalFeatures getEmptyKey() {
+    return UsedOptionalFeatures::getEmpty();
+  }
+  static UsedOptionalFeatures getTombstoneKey() {
+    return UsedOptionalFeatures::getTombstone();
+  }
+
+  static unsigned getHashValue(const UsedOptionalFeatures &Key) {
+    return Key.hash();
+  }
+
+  static bool isEqual(const UsedOptionalFeatures &First,
+                      const UsedOptionalFeatures &Second) {
+    return First == Second;
+  }
+};
+} // namespace
+
+// Groups the entry points of MD by the set of optional features they use
+// (currently only !sycl_used_aspects) and returns a splitter over the groups.
+std::unique_ptr<ModuleSplitterBase>
+getSplitterByOptionalFeatures(ModuleDesc &&MD,
+                              bool EmitOnlyKernelsAsEntryPoints) {
+  DenseMap<UsedOptionalFeatures, EntryPointSet, UsedOptionalFeaturesAsKeyInfo>
+      EntryPointsByFeatures;
+
+  // Bucket every entry point of this module by the features it uses.
+  for (auto &Func : MD.getModule().functions()) {
+    if (isEntryPoint(Func, EmitOnlyKernelsAsEntryPoints) &&
+        MD.isEntryPointCandidate(Func))
+      EntryPointsByFeatures[UsedOptionalFeatures(&Func)].insert(&Func);
+  }
+
+  EntryPointGroupVec Groups;
+  if (EntryPointsByFeatures.empty()) {
+    // No entry points met, record this.
+    Groups.emplace_back(GLOBAL_SCOPE_NAME, EntryPointSet{});
+  } else {
+    const auto &ParentGroup = MD.getEntryPointGroup();
+    Groups.reserve(EntryPointsByFeatures.size());
+    // NOTE(review): groups are emitted in DenseMap iteration order, which
+    // follows the key hashes rather than insertion order - confirm that the
+    // resulting device image numbering is stable enough for consumers.
+    for (auto &Bucket : EntryPointsByFeatures) {
+      Groups.emplace_back(Bucket.first.getName(ParentGroup.GroupId),
+                          std::move(Bucket.second), ParentGroup.Props);
+    }
+  }
+
+  // Only more than one group requires an actual split of the module.
+  if (Groups.size() <= 1)
+    return std::make_unique<ModuleCopier>(std::move(MD), std::move(Groups));
+
+  return std::make_unique<ModuleSplitter>(std::move(MD), std::move(Groups));
+}
+
 } // namespace module_split
 } // namespace llvm
diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.h b/llvm/tools/sycl-post-link/ModuleSplitter.h
index 7088909c8400a..31a70d99c9dc5 100644
--- a/llvm/tools/sycl-post-link/ModuleSplitter.h
+++ b/llvm/tools/sycl-post-link/ModuleSplitter.h
@@ -253,6 +253,10 @@ getSplitterByMode(ModuleDesc &&MD, IRSplitMode Mode,
 std::unique_ptr<ModuleSplitterBase>
 getLargeGRFSplitter(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints);
 
+std::unique_ptr<ModuleSplitterBase>
+getSplitterByOptionalFeatures(ModuleDesc &&MD,
+                              bool EmitOnlyKernelsAsEntryPoints);
+
 #ifndef NDEBUG
 void dumpEntryPoints(const EntryPointSet &C, const char *msg = "", int Tab = 0);
 void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false,
diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp
index 30ad2da61330c..eb65da70c7d1c 100644
--- a/llvm/tools/sycl-post-link/sycl-post-link.cpp
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp
@@ -756,12 +756,43 @@ processInputModule(std::unique_ptr<Module> M) {
       module_split::getSplitterByMode(module_split::ModuleDesc{std::move(M)},
                                       SplitMode, IROutputOnly,
                                       EmitOnlyKernelsAsEntryPoints);
-  const bool SplitByScope = ScopedSplitter->totalSplits() > 1;
-  Modified |= SplitByScope;
 
+  SmallVector<module_split::ModuleDesc, 8> TopLevelModules;
+  bool SplitByOptionalFeatures = false;
+
+  // FIXME: this check should be performed on all split levels
   if (DeviceGlobals)
     ScopedSplitter->verifyNoCrossModuleDeviceGlobalUsage();
 
+  while (ScopedSplitter->hasMoreSplits()) {
+    module_split::ModuleDesc MD = ScopedSplitter->nextSplit();
+
+    if (IROutputOnly || SplitMode == module_split::SPLIT_NONE) {
+      // We can't perform any kind of split.
+      TopLevelModules.emplace_back(std::move(MD));
+      continue;
+    }
+
+    std::unique_ptr<module_split::ModuleSplitterBase> OptionalFeaturesSplitter =
+        module_split::getSplitterByOptionalFeatures(
+            std::move(MD), EmitOnlyKernelsAsEntryPoints);
+
+    // Here we perform second-level splitting based on device-specific
+    // features used/declared in entry points.
+    // This step is mandatory, because it is required for functional
+    // correctness, i.e. to prevent speculative compilation of kernels that use
+    // optional features on a HW which doesn't support them.
+    while (OptionalFeaturesSplitter->hasMoreSplits()) {
+      TopLevelModules.emplace_back(OptionalFeaturesSplitter->nextSplit());
+    }
+
+    SplitByOptionalFeatures |= OptionalFeaturesSplitter->totalSplits() > 1;
+  }
+
+  const bool SplitByScope = ScopedSplitter->totalSplits() > 1;
+  Modified |= SplitByScope;
+  Modified |= SplitByOptionalFeatures;
+
   // TODO this nested splitting scheme will not scale well when other split
   // "dimensions" will be added. Some infra/"split manager" needs to be
   // implemented in this case - e.g. all needed splitters are registered, then
@@ -769,11 +800,12 @@ processInputModule(std::unique_ptr<Module> M) {
   // "leaf" ModuleDesc's resulted from splitting. Some bookkeeping is needed for
   // ESIMD splitter to link back needed modules.
 
-  // Proceed with top-level splitting.
-  while (ScopedSplitter->hasMoreSplits()) {
-    module_split::ModuleDesc MDesc = ScopedSplitter->nextSplit();
+  // Based on results from the top-level splitting, we perform some lower-level
+  // splitting for various unique features.
+  for (module_split::ModuleDesc &MDesc : TopLevelModules) {
     DUMP_ENTRY_POINTS(MDesc.entries(), MDesc.Name.c_str(), 1);
 
+    // FIXME: large grf should be handled by properties splitter above
     std::unique_ptr<module_split::ModuleSplitterBase> LargeGRFSplitter =
         module_split::getLargeGRFSplitter(std::move(MDesc),
                                           EmitOnlyKernelsAsEntryPoints);
@@ -848,7 +880,9 @@ processInputModule(std::unique_ptr<Module> M) {
         DUMP_ENTRY_POINTS(MMs.back().entries(), MMs.back().Name.c_str(), 3);
         Modified = true;
       }
-      bool SplitOccurred = SplitByScope || SplitByLargeGRF || SplitByESIMD;
+
+      bool SplitOccurred = SplitByScope || SplitByLargeGRF || SplitByESIMD ||
+                           SplitByOptionalFeatures;
 
       if (IROutputOnly) {
         if (SplitOccurred) {