diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b86..71fb02d872c6a 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -224,6 +224,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   unsigned OffsetEltIndex = 0;
+  unsigned VectorRange = 0;
+  bool NeedCast = false;
   Align Alignment = Load->getAlign();
   if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                    &DT)) {
@@ -240,15 +242,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
     if (Offset.isNegative())
       return false;
 
-    // The offset must be a multiple of the scalar element to shuffle cleanly
-    // in the element's size.
+    // If the offset is a multiple of the scalar element size, shuffle at that
+    // element size; otherwise, shrink the element type so that both the
+    // offset and the loaded scalar land on element boundaries.
     uint64_t ScalarSizeInBytes = ScalarSize / 8;
-    if (Offset.urem(ScalarSizeInBytes) != 0)
-      return false;
+    if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes)) {
+      if (DL->isBigEndian())
+        return false;
+      uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
+      // Shrink ScalarSizeInBytes to the greatest common divisor of the old
+      // element size and the number of unaligned bytes.
+      ScalarSizeInBytes = std::gcd(ScalarSizeInBytes, UnalignedBytes);
+      ScalarSize = ScalarSizeInBytes * 8;
+      VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
+      MinVecNumElts = MinVectorSize / ScalarSize;
+      ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
+      MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+      NeedCast = true;
+    }
 
-    // If we load MinVecNumElts, will our target element still be loaded?
     OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
-    if (OffsetEltIndex >= MinVecNumElts)
+    // If we load MinVecNumElts, will our target element still be loaded?
+    if (OffsetEltIndex + VectorRange >= MinVecNumElts)
       return false;
 
     if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -266,11 +281,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
   Type *LoadTy = Load->getType();
   unsigned AS = Load->getPointerAddressSpace();
+  auto VecTy = cast<InsertElementInst>(&I)->getType();
+
   InstructionCost OldCost =
       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
-  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  APInt DemandedElts =
+      APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
   OldCost +=
-      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+      TTI.getScalarizationOverhead(VecTy, DemandedElts,
                                    /* Insert */ true, HasExtract, CostKind);
 
   // New pattern: load VecPtr
@@ -283,15 +301,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
-  auto *Ty = cast<FixedVectorType>(I.getType());
-  unsigned OutputNumElts = Ty->getNumElements();
-  SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
-  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
-  Mask[0] = OffsetEltIndex;
+  SmallVector<int> Mask;
+  assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
+         "Address offset too big");
+  if (NeedCast) {
+    Mask.assign(MinVecNumElts, PoisonMaskElem);
+    std::iota(Mask.begin(), Mask.begin() + VectorRange, OffsetEltIndex);
+  } else {
+    auto *Ty = cast<FixedVectorType>(I.getType());
+    unsigned OutputNumElts = Ty->getNumElements();
+    Mask.assign(OutputNumElts, PoisonMaskElem);
+    Mask[0] = OffsetEltIndex;
+  }
+
   if (OffsetEltIndex)
     NewCost +=
         TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);
 
+  if (NeedCast)
+    NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
+                                    TargetTransformInfo::CastContextHint::None,
+                                    CostKind);
+
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
   if (OldCost < NewCost || !NewCost.isValid())
@@ -300,12 +331,17 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
+  Value *Result;
   Value *CastedPtr =
       Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
-  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
-  VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+  Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+  Worklist.pushValue(Result);
+  Result = Builder.CreateShuffleVector(Result, Mask);
+  Worklist.pushValue(Result);
+  if (NeedCast)
+    Result = Builder.CreateBitOrPointerCast(Result, I.getType());
 
-  replaceValue(I, *VecLd);
+  replaceValue(I, *Result);
   ++NumVecLoad;
   return true;
 }
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 73476308916fb..f387928696a9e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -290,23 +290,133 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
   ret <8 x i16> %r
 }
 
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
 
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) nofree nosync {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
   %r = insertelement <4 x i32> poison, i32 %s, i64 0
   ret <4 x i32> %r
 }
 
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1