From 27569450c58c82fd717007da9510944993660a04 Mon Sep 17 00:00:00 2001
From: Lauren
Date: Tue, 1 Jul 2025 02:55:20 -0400
Subject: [PATCH 1/4] [VectorCombine] Expand `vector_insert` into shufflevector for earlier cost optimizations (#145512)

Move folding logic from `InstCombineCalls` to `VectorCombine` to ensure
`vector_insert` intrinsics are expanded into shufflevector instructions
before cost-based shuffle optimizations run. Canonicalizes fixed-width
vectors only.
---
 .../InstCombine/InstCombineCalls.cpp          | 46 ------------
 .../Transforms/Vectorize/VectorCombine.cpp    | 71 +++++++++++++++++++
 .../VectorCombine/fold-vector-insert.ll       | 71 +++++++++++++++++++
 3 files changed, 142 insertions(+), 46 deletions(-)
 create mode 100644 llvm/test/Transforms/VectorCombine/fold-vector-insert.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 5b398d3b75f59..df29024a86f67 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3462,52 +3462,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
-  case Intrinsic::vector_insert: {
-    Value *Vec = II->getArgOperand(0);
-    Value *SubVec = II->getArgOperand(1);
-    Value *Idx = II->getArgOperand(2);
-    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
-    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
-    auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
-
-    // Only canonicalize if the destination vector, Vec, and SubVec are all
-    // fixed vectors.
-    if (DstTy && VecTy && SubVecTy) {
-      unsigned DstNumElts = DstTy->getNumElements();
-      unsigned VecNumElts = VecTy->getNumElements();
-      unsigned SubVecNumElts = SubVecTy->getNumElements();
-      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
-
-      // An insert that entirely overwrites Vec with SubVec is a nop.
-      if (VecNumElts == SubVecNumElts)
-        return replaceInstUsesWith(CI, SubVec);
-
-      // Widen SubVec into a vector of the same width as Vec, since
-      // shufflevector requires the two input vectors to be the same width.
-      // Elements beyond the bounds of SubVec within the widened vector are
-      // undefined.
-      SmallVector<int, 8> WidenMask;
-      unsigned i;
-      for (i = 0; i != SubVecNumElts; ++i)
-        WidenMask.push_back(i);
-      for (; i != VecNumElts; ++i)
-        WidenMask.push_back(PoisonMaskElem);
-
-      Value *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
-
-      SmallVector<int, 8> Mask;
-      for (unsigned i = 0; i != IdxN; ++i)
-        Mask.push_back(i);
-      for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i)
-        Mask.push_back(i);
-      for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i)
-        Mask.push_back(i);
-
-      Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
-      return replaceInstUsesWith(CI, Shuffle);
-    }
-    break;
-  }
   case Intrinsic::vector_extract: {
     Value *Vec = II->getArgOperand(0);
     Value *Idx = II->getArgOperand(1);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 19e82099e87f0..dbbc6c5a07ec8 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -112,6 +112,7 @@ class VectorCombine {
   bool foldExtractExtract(Instruction &I);
   bool foldInsExtFNeg(Instruction &I);
   bool foldInsExtBinop(Instruction &I);
+  bool foldVectorInsertToShuffle(Instruction &I);
   bool foldInsExtVectorToShuffle(Instruction &I);
   bool foldBitOpOfBitcasts(Instruction &I);
   bool foldBitcastShuffle(Instruction &I);
@@ -804,6 +805,73 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) {
   return true;
 }
 
+/// Try to fold vector_insert intrinsics into shufflevector instructions.
+bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
+  auto *II = dyn_cast<IntrinsicInst>(&I);
+  // This optimization only applies to vector_insert intrinsics.
+  if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
+    return false;
+
+  Value *Vec = II->getArgOperand(0);
+  Value *SubVec = II->getArgOperand(1);
+  Value *Idx = II->getArgOperand(2);
+
+  // Caller guarantees DstTy is a fixed vector.
+  auto *DstTy = cast<FixedVectorType>(II->getType());
+  auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+  auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+
+  // Only canonicalize if Vec and SubVec are both fixed vectors.
+  if (!VecTy || !SubVecTy)
+    return false;
+
+  unsigned DstNumElts = DstTy->getNumElements();
+  unsigned VecNumElts = VecTy->getNumElements();
+  unsigned SubVecNumElts = SubVecTy->getNumElements();
+  auto *SubVecPtr = dyn_cast<ConstantInt>(Idx);
+  if (!SubVecPtr)
+    return false;
+
+  unsigned SubVecIdx = SubVecPtr->getZExtValue();
+
+  // Ensure insertion of SubVec doesn't exceed Dst bounds.
+  if (SubVecIdx % SubVecNumElts != 0 || SubVecIdx + SubVecNumElts > DstNumElts)
+    return false;
+
+  // An insert that entirely overwrites Vec with SubVec is a nop.
+  if (VecNumElts == SubVecNumElts) {
+    replaceValue(I, *SubVec);
+    return true;
+  }
+
+  // Widen SubVec into a vector of the same width as Vec, since
+  // shufflevector requires the two input vectors to be the same width.
+  // Elements beyond the bounds of SubVec within the widened vector are
+  // undefined.
+  SmallVector<int> WidenMask;
+  unsigned int i = 0;
+  for (i = 0; i != SubVecNumElts; ++i)
+    WidenMask.push_back(i);
+  for (; i != VecNumElts; ++i)
+    WidenMask.push_back(PoisonMaskElem);
+
+  auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
+  Worklist.pushValue(WidenShuffle);
+
+  SmallVector<int> Mask;
+  unsigned int j;
+  for (i = 0; i != SubVecIdx; ++i)
+    Mask.push_back(i);
+  for (j = 0; j != SubVecNumElts; ++j)
+    Mask.push_back(DstNumElts + j);
+  for (i = SubVecIdx + SubVecNumElts; i != DstNumElts; ++i)
+    Mask.push_back(i);
+
+  auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
+  replaceValue(I, *Shuffle);
+  return true;
+}
+
 bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
   // Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y))
   Value *LHSSrc, *RHSSrc;
@@ -3639,6 +3707,9 @@ bool VectorCombine::run() {
   // dispatching to folding functions if there's no chance of matching.
   if (IsFixedVectorType) {
     switch (Opcode) {
+    case Instruction::Call:
+      MadeChange |= foldVectorInsertToShuffle(I);
+      break;
     case Instruction::InsertElement:
       MadeChange |= vectorizeLoadInsert(I);
       break;
diff --git a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
new file mode 100644
index 0000000000000..976fdb322005b
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32>, <8 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32>, <1 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32>, <2 x i32>, i64)
+
+define <8 x i32> @vector_insert_begin(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_begin(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_middle(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_middle(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_end(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_end(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4)
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_overwrite(<8 x i32> %vec, <8 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_overwrite(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <8 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    ret <8 x i32> [[SUBVEC]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0)
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_single_element_at_end(<8 x i32> %vec, <1 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_single_element_at_end(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <1 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i32> [[SUBVEC]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32> %vec, <1 x i32> %subvec, i64 7)
+  ret <8 x i32> %result
+}
+
+define <vscale x 4 x i32> @vector_insert_no_fold_scalable(<vscale x 4 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: define <vscale x 4 x i32> @vector_insert_no_fold_scalable(
+; CHECK-SAME: <vscale x 4 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> [[VEC]], <2 x i32> [[SUBVEC]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RESULT]]
+;
+  %result = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> %vec, <2 x i32> %subvec, i64 0)
+  ret <vscale x 4 x i32> %result
+}

From fc894c8d658554dbb04e5a99c3c8f1aeef986220 Mon Sep 17 00:00:00 2001
From: Lauren
Date: Tue, 1 Jul 2025 04:52:10 -0400
Subject: [PATCH 2/4] [VectorCombine] Move canonicalize-vector-insert tests from InstCombine to VectorCombine

---
 .../canonicalize-vector-insert.ll             | 19 +++--
 .../VectorCombine/fold-vector-insert.ll       | 71 -------------------
 2 files changed, 15 insertions(+), 75 deletions(-)
 rename llvm/test/Transforms/{InstCombine => VectorCombine}/canonicalize-vector-insert.ll (84%)
 delete mode 100644 llvm/test/Transforms/VectorCombine/fold-vector-insert.ll

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll b/llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll
similarity index 84%
rename from llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll
rename to llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll
index ab7a50e55db0f..af6fe52c07920 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
 
 ; llvm.vector.insert canonicalizes to shufflevector in the fixed case. In the
 ; scalable case, we lower to the INSERT_SUBVECTOR ISD node.
@@ -31,7 +31,7 @@ define <8 x i32> @trivial_nop(<8 x i32> %vec, <8 x i32> %subvec) {
 define <8 x i32> @valid_insertion_a(<8 x i32> %vec, <2 x i32> %subvec) {
 ; CHECK-LABEL: @valid_insertion_a(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
   %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 0)
@@ -71,7 +71,7 @@ define <8 x i32> @valid_insertion_d(<8 x i32> %vec, <2 x i32> %subvec) {
 define <8 x i32> @valid_insertion_e(<8 x i32> %vec, <4 x i32> %subvec) {
 ; CHECK-LABEL: @valid_insertion_e(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
   %1 = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
@@ -91,7 +91,7 @@ define <8 x i32> @valid_insertion_f(<8 x i32> %vec, <4 x i32> %subvec) {
 define <8 x i32> @valid_insertion_g(<8 x i32> %vec, <3 x i32> %subvec) {
 ; CHECK-LABEL: @valid_insertion_g(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
   %1 = call <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 0)
@@ -108,6 +108,17 @@ define <8 x i32> @valid_insertion_h(<8 x i32> %vec, <3 x i32> %subvec) {
   ret <8 x i32> %1
 }
 
+; Tests insertion at middle index
+define <8 x i32> @valid_insertion_i(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_i(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <8 x i32> %result
+}
+
 ; ============================================================================ ;
 ; Scalable cases
 ; ============================================================================ ;
diff --git a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
deleted file mode 100644
index 976fdb322005b..0000000000000
--- a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
-
-declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64)
-declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32>, <8 x i32>, i64)
-declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64)
-declare <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32>, <1 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32>, <2 x i32>, i64)
-
-define <8 x i32> @vector_insert_begin(<8 x i32> %vec, <4 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_begin(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
-  ret <8 x i32> %result
-}
-
-define <8 x i32> @vector_insert_middle(<8 x i32> %vec, <2 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_middle(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
-  ret <8 x i32> %result
-}
-
-define <8 x i32> @vector_insert_end(<8 x i32> %vec, <4 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_end(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4)
-  ret <8 x i32> %result
-}
-
-define <8 x i32> @vector_insert_overwrite(<8 x i32> %vec, <8 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_overwrite(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <8 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    ret <8 x i32> [[SUBVEC]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0)
-  ret <8 x i32> %result
-}
-
-define <8 x i32> @vector_insert_single_element_at_end(<8 x i32> %vec, <1 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_single_element_at_end(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <1 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i32> [[SUBVEC]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8>
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32> %vec, <1 x i32> %subvec, i64 7)
-  ret <8 x i32> %result
-}
-
-define <vscale x 4 x i32> @vector_insert_no_fold_scalable(<vscale x 4 x i32> %vec, <2 x i32> %subvec) {
-; CHECK-LABEL: define <vscale x 4 x i32> @vector_insert_no_fold_scalable(
-; CHECK-SAME: <vscale x 4 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[RESULT:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> [[VEC]], <2 x i32> [[SUBVEC]], i64 0)
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[RESULT]]
-;
-  %result = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> %vec, <2 x i32> %subvec, i64 0)
-  ret <vscale x 4 x i32> %result
-}

From 78a18d03e9fca922403550762369a3c2202ba7fb Mon Sep 17 00:00:00 2001
From: Lauren
Date: Tue, 1 Jul 2025 11:44:43 -0400
Subject: [PATCH 3/4] [VectorCombine] Use std::iota for shuffle mask construction

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index dbbc6c5a07ec8..55c320103afb2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -848,24 +848,17 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
   // shufflevector requires the two input vectors to be the same width.
   // Elements beyond the bounds of SubVec within the widened vector are
   // undefined.
-  SmallVector<int> WidenMask;
-  unsigned int i = 0;
-  for (i = 0; i != SubVecNumElts; ++i)
-    WidenMask.push_back(i);
-  for (; i != VecNumElts; ++i)
-    WidenMask.push_back(PoisonMaskElem);
+  SmallVector<int> WidenMask(VecNumElts, PoisonMaskElem);
+  std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0);
+  std::fill(WidenMask.begin() + SubVecNumElts, WidenMask.end(), PoisonMaskElem);
 
   auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
   Worklist.pushValue(WidenShuffle);
 
-  SmallVector<int> Mask;
-  unsigned int j;
-  for (i = 0; i != SubVecIdx; ++i)
-    Mask.push_back(i);
-  for (j = 0; j != SubVecNumElts; ++j)
-    Mask.push_back(DstNumElts + j);
-  for (i = SubVecIdx + SubVecNumElts; i != DstNumElts; ++i)
-    Mask.push_back(i);
+  SmallVector<int> Mask(DstNumElts);
+  std::iota(Mask.begin(), Mask.begin() + SubVecIdx, 0);
+  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, DstNumElts);
+  std::iota(Mask.begin() + SubVecIdx + SubVecNumElts, Mask.end(), SubVecIdx + SubVecNumElts);
 
   auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
   replaceValue(I, *Shuffle);

From 2abc2e342342cd1ad3c6367dfb2b78b2fa283839 Mon Sep 17 00:00:00 2001
From: Lauren
Date: Tue, 1 Jul 2025 13:22:04 -0400
Subject: [PATCH 4/4] [VectorCombine] Remove redundant `fill` and reduce three loops to two `iota` calls

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 55c320103afb2..609bbbdea2c6b 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -835,7 +835,7 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
   unsigned SubVecIdx = SubVecPtr->getZExtValue();
 
   // Ensure insertion of SubVec doesn't exceed Dst bounds.
-  if (SubVecIdx % SubVecNumElts != 0 || SubVecIdx + SubVecNumElts > DstNumElts)
+  if ((SubVecIdx % SubVecNumElts != 0) || (SubVecIdx + SubVecNumElts > DstNumElts))
     return false;
 
   // An insert that entirely overwrites Vec with SubVec is a nop.
@@ -850,18 +850,17 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
   // undefined.
   SmallVector<int> WidenMask(VecNumElts, PoisonMaskElem);
   std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0);
-  std::fill(WidenMask.begin() + SubVecNumElts, WidenMask.end(), PoisonMaskElem);
 
   auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
   Worklist.pushValue(WidenShuffle);
 
   SmallVector<int> Mask(DstNumElts);
-  std::iota(Mask.begin(), Mask.begin() + SubVecIdx, 0);
-  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, DstNumElts);
-  std::iota(Mask.begin() + SubVecIdx + SubVecNumElts, Mask.end(), SubVecIdx + SubVecNumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);
+  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts,
+            DstNumElts);
 
-  auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
-  replaceValue(I, *Shuffle);
+  auto *InsertShuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
+  replaceValue(I, *InsertShuffle);
   return true;
 }
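
Appendix (not part of the patch series): a minimal standalone C++ sketch of the mask construction as it stands after PATCH 4, using the values from the @vector_insert_middle test (destination of 8 elements, subvector of 2 elements, insert index 2). The main/std::vector/printf scaffolding is illustrative only, and PoisonMaskElem is modelled as a plain -1 sentinel rather than LLVM's constant.

// Illustrative sketch of foldVectorInsertToShuffle's mask construction.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const unsigned DstNumElts = 8, VecNumElts = 8;
  const unsigned SubVecNumElts = 2, SubVecIdx = 2;
  const int PoisonMaskElem = -1; // stand-in for LLVM's sentinel

  // Widen mask: identity over the subvector lanes, poison for the tail.
  std::vector<int> WidenMask(VecNumElts, PoisonMaskElem);
  std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0);

  // Insert mask: identity over Vec, then overwrite the inserted window with
  // indices into the widened subvector (the second shuffle operand).
  std::vector<int> Mask(DstNumElts);
  std::iota(Mask.begin(), Mask.end(), 0);
  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts,
            DstNumElts);

  for (int M : WidenMask)
    std::printf("%d ", M); // prints: 0 1 -1 -1 -1 -1 -1 -1
  std::printf("\n");
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 1 8 9 4 5 6 7
  std::printf("\n");
  return 0;
}

The WidenMask constructor already initializes every lane to PoisonMaskElem, which is why the std::fill dropped in PATCH 4 is redundant, and seeding Mask with a full identity iota lets a single iota over the inserted window replace the three range-wise calls from PATCH 3. The second printed mask matches the CHECK line for @vector_insert_middle above.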