From 4cee120c7e13906bc12fc92fd1c0997922174757 Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Sat, 7 Dec 2024 07:00:41 +0900
Subject: [PATCH 1/8] Add test cases

---
 .../VectorCombine/X86/load-inseltpoison.ll | 82 ++++++++++++++++++-
 1 file changed, 80 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 73476308916fb..c58a162543efe 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -294,8 +294,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
 ; must be a multiple of element size.
 ; TODO: Could bitcast around this limitation.
 
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) nofree nosync {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
 ; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
 ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
@@ -307,6 +307,84 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceabl
   ret <4 x i32> %r
 }
 
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) nofree nosync {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1

From 33a7de82ad94fe2a0e41c344afa0eb7d1b16aae6 Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Sat, 7 Dec 2024 06:00:08 +0900
Subject: [PATCH 2/8] [VectorCombine] Allow shuffling with bitcast when the
 offset is not a multiple of the load size

Previously, vectorization for load-insert failed when the Offset was not a
multiple of the Load type size.

This patch allows it in two steps:
1. Vectorize it using the greatest common divisor of the Offset and the
   LoadSize as the new element size.
2. Bitcast the result back to the original vector type.

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ
---
 .../Transforms/Vectorize/VectorCombine.cpp | 76 +++++++++---
 .../VectorCombine/X86/load-inseltpoison.ll | 108 ++++++++++++------
 2 files changed, 130 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 746742e14d080..7c1ba361ce4ee 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -205,6 +205,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (!canWidenLoad(Load, TTI))
     return false;
 
+  auto MaxCommonDivisor = [](int n) {
+    if (n % 4 == 0)
+      return 4;
+    if (n % 2 == 0)
+      return 2;
+    else
+      return 1;
+  };
+
   Type *ScalarTy = Scalar->getType();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
@@ -219,6 +228,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   unsigned OffsetEltIndex = 0;
+  unsigned VectorRange = 0;
+  bool NeedCast = false;
   Align Alignment = Load->getAlign();
   if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                    &DT)) {
@@ -235,15 +246,27 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (Offset.isNegative())
     return false;
 
-  // The offset must be a multiple of the scalar element to shuffle cleanly
-  // in the element's size.
+  // If Offset is multiple of a Scalar element, it can be shuffled to the
+  // element's size; otherwise, Offset and Scalar must be shuffled to the
+  // appropriate element size for both.
   uint64_t ScalarSizeInBytes = ScalarSize / 8;
-  if (Offset.urem(ScalarSizeInBytes) != 0)
-    return false;
+  if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
+      UnalignedBytes != 0) {
+    uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
+    // Assign the greatest common divisor between UnalignedBytes and Offset to
+    // ScalarSizeInBytes
+    ScalarSizeInBytes = MaxCommonDivisor(UnalignedBytes);
+    ScalarSize = ScalarSizeInBytes * 8;
+    VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
+    MinVecNumElts = MinVectorSize / ScalarSize;
+    ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
+    MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+    NeedCast = true;
+  }
 
-  // If we load MinVecNumElts, will our target element still be loaded?
   OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
-  if (OffsetEltIndex >= MinVecNumElts)
+  // If we load MinVecNumElts, will our target element still be loaded?
+  if (OffsetEltIndex + VectorRange >= MinVecNumElts)
     return false;
 
   if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -261,11 +284,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
   Type *LoadTy = Load->getType();
   unsigned AS = Load->getPointerAddressSpace();
+  auto VecTy = cast(&I)->getType();
+
   InstructionCost OldCost =
       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
-  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  APInt DemandedElts =
+      APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
   OldCost +=
-      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+      TTI.getScalarizationOverhead(VecTy, DemandedElts,
                                    /* Insert */ true, HasExtract, CostKind);
 
   // New pattern: load VecPtr
@@ -278,15 +304,29 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
-  auto *Ty = cast(I.getType());
-  unsigned OutputNumElts = Ty->getNumElements();
-  SmallVector Mask(OutputNumElts, PoisonMaskElem);
-  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
-  Mask[0] = OffsetEltIndex;
+  SmallVector Mask;
+  assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
+         "Address offset too big");
+  if (!NeedCast) {
+    auto *Ty = cast(I.getType());
+    unsigned OutputNumElts = Ty->getNumElements();
+    Mask.assign(OutputNumElts, PoisonMaskElem);
+    Mask[0] = OffsetEltIndex;
+  } else {
+    Mask.assign(MinVecNumElts, PoisonMaskElem);
+    for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
+      Mask[InsertPos] = OffsetEltIndex++;
+  }
+
   if (OffsetEltIndex)
     NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask,
                                   CostKind);
+  if (NeedCast)
+    NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
+                                    TargetTransformInfo::CastContextHint::None,
+                                    CostKind);
+
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
   if (OldCost < NewCost || !NewCost.isValid())
     return false;
@@ -295,12 +335,16 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
+  Value *Result;
   Value *CastedPtr =
       Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
-  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
-  VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+  Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+  Result = Builder.CreateShuffleVector(Result, Mask);
 
-  replaceValue(I, *VecLd);
+  if (NeedCast)
+    Result = Builder.CreateBitOrPointerCast(Result, I.getType());
+
+  replaceValue(I, *Result);
   ++NumVecLoad;
   return true;
 }
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index c58a162543efe..f387928696a9e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -290,16 +290,18 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
   ret <8 x i16> %r
 }
 
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
-
 define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
@@ -308,11 +310,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
 }
 
 define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
-; CHECK-NEXT: ret <2 x i64> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
   %s = load i64, ptr %gep, align 1
@@ -321,11 +329,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 der
 }
 
 define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
-; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32>
+; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
   %s = load i32, ptr %gep, align 1
@@ -334,11 +348,17 @@ define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
 }
 
 define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
@@ -347,11 +367,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
 }
 
 define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
-; CHECK-NEXT: ret <2 x i64> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT: ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; AVX2-NEXT: ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i64, ptr %gep, align 1
@@ -360,11 +386,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 der
 }
 
 define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
-; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT: ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32>
+; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT: ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
   %s = load i32, ptr %gep, align 1
@@ -372,11 +404,11 @@ define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
   ret <4 x i32> %r
 }
 
-define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) nofree nosync {
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
 ; CHECK-NEXT: ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1

From 3de48b7612bfbc6b0a945a14ef1e0cff894b58fe Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Mon, 7 Apr 2025 05:25:40 +0900
Subject: [PATCH 3/8] Bail out on big-endian targets

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7c1ba361ce4ee..62d5eb6fa9582 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -252,6 +252,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   uint64_t ScalarSizeInBytes = ScalarSize / 8;
   if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
       UnalignedBytes != 0) {
+    if (DL->isBigEndian())
+      return false;
     uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
     // Assign the greatest common divisor between UnalignedBytes and Offset to
     // ScalarSizeInBytes

From 0e3f580ac4d31950532d9a074c8c00f373370229 Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Tue, 8 Apr 2025 06:32:26 +0900
Subject: [PATCH 4/8] Use std::gcd instead of a hand-rolled helper

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 62d5eb6fa9582..ac299bbe6ccdd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -205,15 +205,6 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (!canWidenLoad(Load, TTI))
     return false;
 
-  auto MaxCommonDivisor = [](int n) {
-    if (n % 4 == 0)
-      return 4;
-    if (n % 2 == 0)
-      return 2;
-    else
-      return 1;
-  };
-
   Type *ScalarTy = Scalar->getType();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
@@ -257,7 +248,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
     uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
     // Assign the greatest common divisor between UnalignedBytes and Offset to
     // ScalarSizeInBytes
-    ScalarSizeInBytes = MaxCommonDivisor(UnalignedBytes);
+    ScalarSizeInBytes = std::gcd(ScalarSizeInBytes, UnalignedBytes);
     ScalarSize = ScalarSizeInBytes * 8;
     VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
     MinVecNumElts = MinVectorSize / ScalarSize;

From 9bfc721aff3f6730b93fdd79dda7c43367642379 Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Tue, 8 Apr 2025 06:33:35 +0900
Subject: [PATCH 5/8] Push newly created IR to the Worklist

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index ac299bbe6ccdd..08d74652728ad 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -332,10 +332,13 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Value *CastedPtr =
       Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
   Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+  Worklist.pushValue(Result);
   Result = Builder.CreateShuffleVector(Result, Mask);
-
-  if (NeedCast)
+  Worklist.pushValue(Result);
+  if (NeedCast) {
     Result = Builder.CreateBitOrPointerCast(Result, I.getType());
+    Worklist.pushValue(Result);
+  }
 
   replaceValue(I, *Result);
   ++NumVecLoad;

From 6a196461dfa70ad8e2f90c7c719942a092ffb30b Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Tue, 29 Apr 2025 14:35:38 +0900
Subject: [PATCH 6/8] Remove unnecessary checks

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 08d74652728ad..fb6f57f9fc9b1 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -241,8 +241,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // element's size; otherwise, Offset and Scalar must be shuffled to the
   // appropriate element size for both.
   uint64_t ScalarSizeInBytes = ScalarSize / 8;
-  if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
-      UnalignedBytes != 0) {
+  if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes)) {
     if (DL->isBigEndian())
       return false;
     uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;

From 84914a7760402cb96b5272367464c2c4e6c9e499 Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Tue, 29 Apr 2025 14:36:41 +0900
Subject: [PATCH 7/8] Replace the for loop with std::iota and reorder the
 conditional

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fb6f57f9fc9b1..e4306f97593e7 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -299,15 +299,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   SmallVector Mask;
   assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
          "Address offset too big");
-  if (!NeedCast) {
+  if (NeedCast) {
+    Mask.assign(MinVecNumElts, PoisonMaskElem);
+    std::iota(Mask.begin(), Mask.begin() + VectorRange, OffsetEltIndex);
+  } else {
     auto *Ty = cast(I.getType());
     unsigned OutputNumElts = Ty->getNumElements();
     Mask.assign(OutputNumElts, PoisonMaskElem);
     Mask[0] = OffsetEltIndex;
-  } else {
-    Mask.assign(MinVecNumElts, PoisonMaskElem);
-    for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
-      Mask[InsertPos] = OffsetEltIndex++;
   }

From 3277e2e9401c0d12e96e5d8f4d9d40fae5a81cee Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Tue, 29 Apr 2025 14:37:58 +0900
Subject: [PATCH 8/8] Remove duplicate pushes to the Worklist

replaceValue already adds the new instruction to the worklist internally, so
there is no need to push it explicitly.
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e4306f97593e7..a733902737c5a 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -333,10 +333,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Worklist.pushValue(Result);
   Result = Builder.CreateShuffleVector(Result, Mask);
   Worklist.pushValue(Result);
-  if (NeedCast) {
+  if (NeedCast)
     Result = Builder.CreateBitOrPointerCast(Result, I.getType());
-    Worklist.pushValue(Result);
-  }
 
   replaceValue(I, *Result);
   ++NumVecLoad;