diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 9bbb89e37865d..d4319c5c3ac69 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -430,6 +430,10 @@ class CodeGenPrepare {
   bool optimizeInst(Instruction *I, ModifyDT &ModifiedDT);
   bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                           Type *AccessTy, unsigned AddrSpace);
+  Value *splitLastVectorIndex(Instruction *MemoryInst,
+                              const GetElementPtrInst *GEP);
+  Value *reassociateVectorOps(Instruction *MemoryInst,
+                              const GetElementPtrInst *GEP);
   bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
   bool optimizeInlineAsmInst(CallInst *CS);
   bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
@@ -6235,23 +6239,176 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   return true;
 }
 
+Value *CodeGenPrepare::splitLastVectorIndex(Instruction *MemoryInst,
+                                            const GetElementPtrInst *GEP) {
+  SmallVector<Value *, 8> Ops(GEP->operands());
+
+  bool RewriteGEP = false;
+
+  if (Ops[0]->getType()->isVectorTy()) {
+    Ops[0] = getSplatValue(Ops[0]);
+    if (!Ops[0])
+      return nullptr;
+    RewriteGEP = true;
+  }
+
+  unsigned FinalIndex = Ops.size() - 1;
+
+  // Ensure all but the last index is 0.
+  // FIXME: This isn't strictly required. All that's required is that they are
+  // all scalars or splats.
+  for (unsigned i = 1; i < FinalIndex; ++i) {
+    auto *C = dyn_cast<Constant>(Ops[i]);
+    if (!C)
+      return nullptr;
+    if (isa<VectorType>(C->getType()))
+      C = C->getSplatValue();
+    auto *CI = dyn_cast_or_null<ConstantInt>(C);
+    if (!CI || !CI->isZero())
+      return nullptr;
+    // Scalarize the index if needed.
+    Ops[i] = CI;
+  }
+
+  // Try to scalarize the final index.
+  if (Ops[FinalIndex]->getType()->isVectorTy()) {
+    if (Value *V = getSplatValue(Ops[FinalIndex])) {
+      auto *C = dyn_cast<ConstantInt>(V);
+      // Don't scalarize all zeros vector.
+      if (!C || !C->isZero()) {
+        Ops[FinalIndex] = V;
+        RewriteGEP = true;
+      }
+    }
+  }
+
+  // If we made any changes or we have extra operands, we need to generate
+  // new instructions.
+  if (!RewriteGEP && Ops.size() == 2)
+    return nullptr;
+
+  auto NumElts = cast<VectorType>(GEP->getType())->getElementCount();
+
+  IRBuilder<> Builder(MemoryInst);
+
+  Type *SourceTy = GEP->getSourceElementType();
+  Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
+
+  // If the final index isn't a vector, emit a scalar GEP containing all ops
+  // and a vector GEP with all zeroes final index.
+  if (!Ops[FinalIndex]->getType()->isVectorTy()) {
+    Value *NewAddr =
+        Builder.CreateGEP(SourceTy, Ops[0], ArrayRef(Ops).drop_front());
+    auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts);
+    auto *SecondTy =
+        GetElementPtrInst::getIndexedType(SourceTy, ArrayRef(Ops).drop_front());
+    return Builder.CreateGEP(SecondTy, NewAddr,
+                             Constant::getNullValue(IndexTy));
+  }
+
+  Value *Base = Ops[0];
+  Value *Index = Ops[FinalIndex];
+
+  // Create a scalar GEP if there are more than 2 operands.
+  if (Ops.size() != 2) {
+    // Replace the last index with 0.
+    Ops[FinalIndex] =
+        Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType());
+    Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(Ops).drop_front());
+    SourceTy =
+        GetElementPtrInst::getIndexedType(SourceTy, ArrayRef(Ops).drop_front());
+  }
+
+  // Now create the GEP with scalar pointer and vector index.
+  return Builder.CreateGEP(SourceTy, Base, Index);
+}
+
+/// The addressing performed by a GEP is simply a set of adds. Adds are
+/// by definition reassociable. If the indices of this GEP contain
+/// both scalar and vector indices, we can split the result into two
+/// GEPs (zeroing out a subset of indices in each) while computing the
+/// same results. We do have to be careful to only zero indices where
+/// that doesn't change the type traversed (i.e. not for structs). Doing
+/// so has the effect of grouping all vector arithmetic after all scalar
+/// arithmetic, and encourages scalar base identification.
+Value *CodeGenPrepare::reassociateVectorOps(Instruction *MemoryInst,
+                                            const GetElementPtrInst *GEP) {
+  SmallVector<Value *, 8> Ops(GEP->operands());
+  const unsigned E = Ops.size();
+
+  if (Ops[0]->getType()->isVectorTy()) {
+    Ops[0] = getSplatValue(Ops[0]);
+    if (!Ops[0])
+      return nullptr;
+  }
+
+  // Check that there is at least one non-zero scalar index and at least one
+  // non-zero vector index, and that we aren't trying to iterate through a
+  // struct type where changing the index to zero would be illegal.
+  bool HasNonZeroScalar = false, HasNonZeroVector = false;
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned i = 1; i < E; ++i, ++GTI) {
+    if (Value *V = getSplatValue(Ops[i]))
+      Ops[i] = V;
+
+    // Zeros don't count in terms of splitting, and zero struct
+    // indices are fine.
+    if (match(Ops[i], m_Zero()))
+      continue;
+
+    if (GTI.getStructTypeOrNull())
+      return nullptr;
+
+    if (isa<VectorType>(Ops[i]->getType()))
+      HasNonZeroVector = true;
+    else
+      HasNonZeroScalar = true;
+  }
+
+  if (!HasNonZeroVector || !HasNonZeroScalar)
+    return nullptr;
+
+  SmallVector<Value *, 8> ScalarOps(Ops);
+  SmallVector<Value *, 8> VectorOps(Ops);
+  for (unsigned i = 1; i < E; i++) {
+    auto *IdxTy = Ops[i]->getType()->getScalarType();
+    auto *ScalarZero = Constant::getNullValue(IdxTy);
+    if (isa<VectorType>(Ops[i]->getType()))
+      ScalarOps[i] = ScalarZero;
+    else
+      VectorOps[i] = ScalarZero;
+  }
+
+  IRBuilder<> Builder(MemoryInst);
+  Type *SourceTy = GEP->getSourceElementType();
+  Value *Base = Ops[0];
+  Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(ScalarOps).drop_front());
+  Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(VectorOps).drop_front());
+  return Base;
+}
+
 /// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
 /// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
-/// only handle a 2 operand GEP in the same basic block or a splat constant
-/// vector. The 2 operands to the GEP must have a scalar pointer and a vector
-/// index.
+/// only handle a GEP with a scalar pointer and one non-zero vector index in
+/// the same basic block or a splat constant vector. We use a GEP with a
+/// vector zeroinitializer index for the canonical splat, as SelectionDAG has
+/// trouble with splats which might be defined in different blocks.
 ///
 /// If the existing GEP has a vector base pointer that is splat, we can look
 /// through the splat to find the scalar pointer. If we can't find a scalar
 /// pointer there's nothing we can do.
 ///
-/// If we have a GEP with more than 2 indices where the middle indices are all
-/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
-///
-/// If the final index isn't a vector or is a splat, we can emit a scalar GEP
-/// followed by a GEP with an all zeroes vector index. This will enable
-/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
-/// zero index.
+/// With this goal in mind, we perform three related transforms:
+/// 1) If the GEP could be entirely scalarized, we do so and emit a separate
+///    getelementptr p, zeroinitializer to splat the result.
+/// 2) If the GEP can be scalarized in all but the last index, we split
+///    the GEP into a scalar prefix, and a base + vector-index GEP for the
+///    final index.
+/// 3) If the GEP has a mixture of scalar and vector indices, we split the
+///    GEP into a pair of GEPs with some of the indices zeroed out in each.
+///    This essentially reassociates the GEP such that all scalar addressing
+///    is done before all vector addressing. This transform has restrictions
+///    when indexing through struct types.
 bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
                                                Value *Ptr) {
   Value *NewAddr;
 
@@ -6266,85 +6423,12 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
     if (MemoryInst->getParent() != GEP->getParent())
       return false;
 
-    SmallVector<Value *, 8> Ops(GEP->operands());
-
-    bool RewriteGEP = false;
-
-    if (Ops[0]->getType()->isVectorTy()) {
-      Ops[0] = getSplatValue(Ops[0]);
-      if (!Ops[0])
-        return false;
-      RewriteGEP = true;
-    }
-
-    unsigned FinalIndex = Ops.size() - 1;
-
-    // Ensure all but the last index is 0.
-    // FIXME: This isn't strictly required. All that's required is that they are
-    // all scalars or splats.
-    for (unsigned i = 1; i < FinalIndex; ++i) {
-      auto *C = dyn_cast<Constant>(Ops[i]);
-      if (!C)
-        return false;
-      if (isa<VectorType>(C->getType()))
-        C = C->getSplatValue();
-      auto *CI = dyn_cast_or_null<ConstantInt>(C);
-      if (!CI || !CI->isZero())
-        return false;
-      // Scalarize the index if needed.
-      Ops[i] = CI;
-    }
-
-    // Try to scalarize the final index.
-    if (Ops[FinalIndex]->getType()->isVectorTy()) {
-      if (Value *V = getSplatValue(Ops[FinalIndex])) {
-        auto *C = dyn_cast<ConstantInt>(V);
-        // Don't scalarize all zeros vector.
-        if (!C || !C->isZero()) {
-          Ops[FinalIndex] = V;
-          RewriteGEP = true;
-        }
-      }
-    }
-
-    // If we made any changes or the we have extra operands, we need to generate
-    // new instructions.
-    if (!RewriteGEP && Ops.size() == 2)
+    if (auto *V = splitLastVectorIndex(MemoryInst, GEP))
+      NewAddr = V;
+    else if (auto *V = reassociateVectorOps(MemoryInst, GEP))
+      NewAddr = V;
+    else
       return false;
-
-    auto NumElts = cast<VectorType>(Ptr->getType())->getElementCount();
-
-    IRBuilder<> Builder(MemoryInst);
-
-    Type *SourceTy = GEP->getSourceElementType();
-    Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
-
-    // If the final index isn't a vector, emit a scalar GEP containing all ops
-    // and a vector GEP with all zeroes final index.
-    if (!Ops[FinalIndex]->getType()->isVectorTy()) {
-      NewAddr = Builder.CreateGEP(SourceTy, Ops[0], ArrayRef(Ops).drop_front());
-      auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts);
-      auto *SecondTy = GetElementPtrInst::getIndexedType(
-          SourceTy, ArrayRef(Ops).drop_front());
-      NewAddr =
-          Builder.CreateGEP(SecondTy, NewAddr, Constant::getNullValue(IndexTy));
-    } else {
-      Value *Base = Ops[0];
-      Value *Index = Ops[FinalIndex];
-
-      // Create a scalar GEP if there are more than 2 operands.
-      if (Ops.size() != 2) {
-        // Replace the last index with 0.
-        Ops[FinalIndex] =
-            Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType());
-        Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(Ops).drop_front());
-        SourceTy = GetElementPtrInst::getIndexedType(
-            SourceTy, ArrayRef(Ops).drop_front());
-      }
-
-      // Now create the GEP with scalar pointer and vector index.
-      NewAddr = Builder.CreateGEP(SourceTy, Base, Index);
-    }
   } else if (!isa<Constant>(Ptr)) {
     // Not a GEP, maybe its a splat and we can create a GEP to enable
     // SelectionDAGBuilder to use it as a uniform base.
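Illustration (not part of the patch, value names are assumed): transform 3 above corresponds to the @reassociate case in the CodeGenPrepare/X86 test updated below. In IR terms, a GEP that mixes a vector index with a trailing non-zero scalar index is rewritten into a scalar-only GEP feeding a vector-only GEP, so all scalar addressing produces a uniform base before any vector addressing happens:

  ; Before: scalar and vector indices are mixed in a single GEP.
  %gep = getelementptr [256 x i32], ptr %base, <4 x i64> %vecidx, i64 %index

  ; After: scalar addressing first (uniform base), vector addressing second.
  ; Both forms compute the same addresses.
  %scalar = getelementptr [256 x i32], ptr %base, i64 0, i64 %index
  %gep = getelementptr [256 x i32], ptr %scalar, <4 x i64> %vecidx, i64 0
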
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 3633885bfa7d2..6cd29b4df5bd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -2386,11 +2386,11 @@ define <4 x i32> @scalar_prefix(ptr %base, i32 signext %index, <4 x i32> %vecidx
 ;
 ; RV64-LABEL: scalar_prefix:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 4
 ; RV64-NEXT:    slli a1, a1, 10
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vwmulsu.vx v10, v8, a2
 ; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    li a1, 4
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vwmulsu.vx v10, v8, a1
 ; RV64-NEXT:    vluxei64.v v8, (a0), v10
 ; RV64-NEXT:    ret
   %gep = getelementptr [256 x i32], ptr %base, i32 %index, <4 x i32> %vecidx
@@ -2401,26 +2401,22 @@ define <4 x i32> @scalar_prefix_with_splat(ptr %base, i32 %index, <4 x i32> %vecidx) {
 ; RV32-LABEL: scalar_prefix_with_splat:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a1, 10
+; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsll.vi v9, v9, 10
-; RV32-NEXT:    vadd.vx v9, v9, a0
 ; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vadd.vv v8, v9, v8
-; RV32-NEXT:    vluxei32.v v8, (zero), v8
+; RV32-NEXT:    vluxei32.v v8, (a0), v8
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: scalar_prefix_with_splat:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a2, 1024
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
-; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v9, a2
-; RV64-NEXT:    vwmaccsu.vx v10, a1, v9
-; RV64-NEXT:    li a0, 4
-; RV64-NEXT:    vwmaccus.vx v10, a0, v8
-; RV64-NEXT:    vluxei64.v v8, (zero), v10
+; RV64-NEXT:    sext.w a1, a1
+; RV64-NEXT:    slli a1, a1, 10
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    li a1, 4
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vwmulsu.vx v10, v8, a1
+; RV64-NEXT:    vluxei64.v v8, (a0), v10
 ; RV64-NEXT:    ret
   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -2442,11 +2438,11 @@ define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i32> %vecidx
 ;
 ; RV64-LABEL: scalar_prefix_with_constant_splat:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 5
+; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    li a1, 4
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vwmulsu.vx v10, v8, a1
-; RV64-NEXT:    lui a1, 5
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vluxei64.v v8, (a0), v10
 ; RV64-NEXT:    ret
   %gep = getelementptr [256 x i32], ptr %base, <4 x i32> splat (i32 20), <4 x i32> %vecidx
@@ -2457,25 +2453,22 @@ define <4 x i32> @reassociate(ptr %base, i32 %index, <4 x i32> %vecidx) {
 ; RV32-LABEL: reassociate:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 10
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vadd.vx v8, v8, a0
-; RV32-NEXT:    vsll.vi v9, v9, 2
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    vluxei32.v v8, (zero), v8
+; RV32-NEXT:    vluxei32.v v8, (a0), v8
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reassociate:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
-; RV64-NEXT:    li a0, 1024
-; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vwmaccus.vx v10, a0, v8
-; RV64-NEXT:    vmv.v.i v8, 4
-; RV64-NEXT:    vwmaccsu.vx v10, a1, v8
-; RV64-NEXT:    vluxei64.v v8, (zero), v10
+; RV64-NEXT:    sext.w a1, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    li a1, 1024
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vwmulsu.vx v10, v8, a1
+; RV64-NEXT:    vluxei64.v v8, (a0), v10
 ; RV64-NEXT:    ret
   %gep = getelementptr [256 x i32], ptr %base, <4 x i32> %vecidx, i32 %index
   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
@@ -2485,25 +2478,22 @@ define <4 x i32> @reassociate_with_splat(ptr %base, i32 %index, <4 x i32> %vecidx) {
 ; RV32-LABEL: reassociate_with_splat:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsll.vi v8, v8, 10
-; RV32-NEXT:    vadd.vx v8, v8, a0
-; RV32-NEXT:    vsll.vi v9, v9, 2
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    vluxei32.v v8, (zero), v8
+; RV32-NEXT:    vluxei32.v v8, (a0), v8
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reassociate_with_splat:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
-; RV64-NEXT:    li a0, 1024
-; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vwmaccus.vx v10, a0, v8
-; RV64-NEXT:    vmv.v.i v8, 4
-; RV64-NEXT:    vwmaccsu.vx v10, a1, v8
-; RV64-NEXT:    vluxei64.v v8, (zero), v10
+; RV64-NEXT:    sext.w a1, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    li a1, 1024
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vwmulsu.vx v10, v8, a1
+; RV64-NEXT:    vluxei64.v v8, (a0), v10
 ; RV64-NEXT:    ret
   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -2516,18 +2506,18 @@ define <4 x i32> @reassociate_with_constant_splat(ptr %base, i32 %index, <4 x i32> %vecidx) {
 ; RV32-LABEL: reassociate_with_constant_splat:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    addi a0, a0, 80
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 10
-; RV32-NEXT:    addi a0, a0, 80
 ; RV32-NEXT:    vluxei32.v v8, (a0), v8
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reassociate_with_constant_splat:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi a0, a0, 80
 ; RV64-NEXT:    li a1, 1024
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vwmulsu.vx v10, v8, a1
-; RV64-NEXT:    addi a0, a0, 80
 ; RV64-NEXT:    vluxei64.v v8, (a0), v10
 ; RV64-NEXT:    ret
   %gep = getelementptr [256 x i32], ptr %base, <4 x i32> %vecidx, <4 x i32> splat (i32 20)
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
index e27d5d772a7a4..09932fe403e4e 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll
@@ -112,7 +112,9 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
 
 define <4 x i32> @scalar_prefix(ptr %base, i64 %index, <4 x i64> %vecidx) {
 ; CHECK-LABEL: @scalar_prefix(
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 [[INDEX:%.*]], <4 x i64> [[VECIDX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 [[INDEX:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr [[TMP1]] to ptr
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[TMP3]], <4 x i64> [[VECIDX:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef)
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
@@ -123,9 +125,9 @@ define <4 x i32> @scalar_prefix(ptr %base, i64 %index, <4 x i64> %vecidx) {
 
 define <4 x i32> @scalar_prefix_with_splat(ptr %base, i64 %index, <4 x i64> %vecidx) {
 ; CHECK-LABEL: @scalar_prefix_with_splat(
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> [[VECIDX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 [[INDEX:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr [[TMP1]] to ptr
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[TMP3]], <4 x i64> [[VECIDX:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef)
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
@@ -139,7 +141,9 @@ define <4 x i32> @scalar_prefix_with_splat(ptr %base, i64 %index, <4 x i64> %vec
 
 define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i64> %vecidx) {
 ; CHECK-LABEL: @scalar_prefix_with_constant_splat(
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> splat (i64 20), <4 x i64> [[VECIDX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 20, i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr [[TMP1]] to ptr
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[TMP3]], <4 x i64> [[VECIDX:%.*]]
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef)
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
@@ -150,7 +154,8 @@ define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i64> %vecidx
 
 define <4 x i32> @reassociate(ptr %base, i64 %index, <4 x i64> %vecidx) {
 ; CHECK-LABEL: @reassociate(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[VECIDX:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 0, i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [256 x i32], ptr [[TMP1]], <4 x i64> [[VECIDX:%.*]], i64 0
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[GEP]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef)
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
@@ -161,9 +166,8 @@ define <4 x i32> @reassociate(ptr %base, i64 %index, <4 x i64> %vecidx) {
 
 define <4 x i32> @reassociate_with_splat(ptr %base, i64 %index, <4 x i64> %vecidx) {
 ; CHECK-LABEL: @reassociate_with_splat(
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[VECIDX:%.*]], <4 x i64> [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 0, i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [256 x i32], ptr [[TMP1]], <4 x i64> [[VECIDX:%.*]], i64 0
 ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[GEP]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef)
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
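
Illustration (not part of the patch, value names are assumed): transform 2, the split-last-vector-index case exercised by @scalar_prefix above, folds the scalar prefix into a scalar GEP that SelectionDAGBuilder can use as the uniform base and leaves only the final vector index in a second GEP (the no-op ptr-to-ptr bitcast emitted in the checks is omitted here):

  ; Before: scalar prefix followed by one vector index.
  %gep = getelementptr [256 x i32], ptr %base, i64 %index, <4 x i64> %vecidx

  ; After: the prefix collapses into a scalar (uniform) base; only the last
  ; index stays vectorized.
  %scalar = getelementptr [256 x i32], ptr %base, i64 %index, i64 0
  %gep = getelementptr i32, ptr %scalar, <4 x i64> %vecidx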