[VectorCombine] Allow shuffling with bitcast for offsets that are not a multiple of the load size #119139

Open · wants to merge 10 commits into main
68 changes: 52 additions & 16 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -224,6 +224,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
unsigned MinVecNumElts = MinVectorSize / ScalarSize;
auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
unsigned OffsetEltIndex = 0;
unsigned VectorRange = 0;
bool NeedCast = false;
Align Alignment = Load->getAlign();
if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
&DT)) {
@@ -240,15 +242,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (Offset.isNegative())
return false;

// The offset must be a multiple of the scalar element to shuffle cleanly
// in the element's size.
// If Offset is a multiple of the scalar element size, the target element can
// be shuffled out at that size; otherwise, shrink the element size to one
// that divides both Offset and the original scalar size, and shuffle at that
// smaller element size.
uint64_t ScalarSizeInBytes = ScalarSize / 8;
if (Offset.urem(ScalarSizeInBytes) != 0)
return false;
if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes)) {
if (DL->isBigEndian())
return false;
uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
// Shrink the scalar size to the greatest common divisor of the original
// scalar size and the number of unaligned bytes.
ScalarSizeInBytes = std::gcd(ScalarSizeInBytes, UnalignedBytes);
ScalarSize = ScalarSizeInBytes * 8;
VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
MinVecNumElts = MinVectorSize / ScalarSize;
ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
NeedCast = true;
}
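// Example (matching the gep01_bitcast_load_i32_from_v16i8 test below): an i32
// load at byte offset 1 has UnalignedBytes = 1 % 4 = 1, so ScalarSizeInBytes
// shrinks to gcd(4, 1) = 1 (i8 elements), VectorRange becomes 4 (one i32
// spans four i8s), and with a 128-bit minimum vector size MinVecNumElts
// becomes 16.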

// If we load MinVecNumElts, will our target element still be loaded?
OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
if (OffsetEltIndex >= MinVecNumElts)
// If we load MinVecNumElts, will our target element still be loaded?
if (OffsetEltIndex + VectorRange >= MinVecNumElts)
return false;

if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -266,11 +281,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
Type *LoadTy = Load->getType();
unsigned AS = Load->getPointerAddressSpace();
auto VecTy = cast<InsertElementInst>(&I)->getType();

InstructionCost OldCost =
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
APInt DemandedElts =
APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
OldCost +=
TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
TTI.getScalarizationOverhead(VecTy, DemandedElts,
/* Insert */ true, HasExtract, CostKind);

// New pattern: load VecPtr
@@ -283,15 +301,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// We assume this operation has no cost in codegen if there was no offset.
// Note that we could use freeze to avoid poison problems, but then we might
// still need a shuffle to change the vector size.
auto *Ty = cast<FixedVectorType>(I.getType());
unsigned OutputNumElts = Ty->getNumElements();
SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
Mask[0] = OffsetEltIndex;
SmallVector<int> Mask;
assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
"Address offset too big");
if (NeedCast) {
Mask.assign(MinVecNumElts, PoisonMaskElem);
std::iota(Mask.begin(), Mask.begin() + VectorRange, OffsetEltIndex);
} else {
auto *Ty = cast<FixedVectorType>(I.getType());
unsigned OutputNumElts = Ty->getNumElements();
Mask.assign(OutputNumElts, PoisonMaskElem);
Mask[0] = OffsetEltIndex;
}
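// Continuing the example above: OffsetEltIndex = 1 / 1 = 1, so the NeedCast
// mask is <1, 2, 3, 4, poison, poison, ...> over 16 i8 elements: the four
// bytes of the original i32 are moved to the front and the rest are poison.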

if (OffsetEltIndex)
NewCost +=
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);

if (NeedCast)
NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
TargetTransformInfo::CastContextHint::None,
CostKind);
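// Taken together, the new cost is the wide vector load plus a single-source
// shuffle when the target element is not at offset 0 and a bitcast back to
// the original vector type when the element size was shrunk; it is compared
// against the scalar load + insertelement cost accumulated in OldCost above.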

// We can aggressively convert to the vector form because the backend can
// invert this transform if it does not result in a performance win.
if (OldCost < NewCost || !NewCost.isValid())
@@ -300,12 +331,17 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// It is safe and potentially profitable to load a vector directly:
// inselt undef, load Scalar, 0 --> load VecPtr
IRBuilder<> Builder(Load);
Value *Result;
Value *CastedPtr =
Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
VecLd = Builder.CreateShuffleVector(VecLd, Mask);
Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
Worklist.pushValue(Result);
Result = Builder.CreateShuffleVector(Result, Mask);
Worklist.pushValue(Result);
if (NeedCast)
Result = Builder.CreateBitOrPointerCast(Result, I.getType());

replaceValue(I, *VecLd);
replaceValue(I, *Result);
++NumVecLoad;
return true;
}
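Taken as a whole, the change rewrites a misaligned scalar load feeding an insertelement into a wide vector load, a narrow-element shuffle, and a bitcast, when the target's cost model approves (AVX2 but not SSE2 in the checks below). Here is a minimal before/after sketch mirroring the first new test, assuming a 128-bit minimum vector width; the function names are illustrative, and the test file drives this through the vector-combine pass in opt:

; before
define <4 x i32> @sketch(ptr align 1 dereferenceable(16) %p) {
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> poison, i32 %s, i64 0
ret <4 x i32> %r
}

; after (elements 1..4 of the byte vector hold the loaded i32)
define <4 x i32> @sketch.vectorized(ptr align 1 dereferenceable(16) %p) {
%v = load <16 x i8>, ptr %p, align 1
%shuf = shufflevector <16 x i8> %v, <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%r = bitcast <16 x i8> %shuf to <4 x i32>
ret <4 x i32> %r
}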
128 changes: 119 additions & 9 deletions llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -290,23 +290,133 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
ret <8 x i16> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size.
; TODO: Could bitcast around this limitation.
define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
; SSE2-NEXT: ret <4 x i32> [[R]]
;
; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
; AVX2-NEXT: ret <4 x i32> [[R]]
;
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> poison, i32 %s, i64 0
ret <4 x i32> %r
}

define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
; CHECK-NEXT: ret <4 x i32> [[R]]
define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
; SSE2-NEXT: ret <2 x i64> [[R]]
;
; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
; AVX2-NEXT: ret <2 x i64> [[R]]
;
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
%s = load i64, ptr %gep, align 1
%r = insertelement <2 x i64> poison, i64 %s, i64 0
ret <2 x i64> %r
}

define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
; SSE2-NEXT: ret <4 x i32> [[R]]
;
; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
; AVX2-NEXT: ret <4 x i32> [[R]]
;
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> poison, i32 %s, i64 0
ret <4 x i32> %r
}

define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
; SSE2-NEXT: ret <4 x i32> [[R]]
;
; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
; AVX2-NEXT: ret <4 x i32> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> poison, i32 %s, i64 0
ret <4 x i32> %r
}

define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
; SSE2-NEXT: ret <2 x i64> [[R]]
;
; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
; AVX2-NEXT: ret <2 x i64> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
%s = load i64, ptr %gep, align 1
%r = insertelement <2 x i64> poison, i64 %s, i64 0
ret <2 x i64> %r
}

define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
; SSE2-NEXT: ret <4 x i32> [[R]]
;
; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
; AVX2-NEXT: ret <4 x i32> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
%s = load i32, ptr %gep, align 1
%r = insertelement <4 x i32> poison, i32 %s, i64 0
ret <4 x i32> %r
}

define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
%s = load i64, ptr %gep, align 1
%r = insertelement <2 x i64> poison, i64 %s, i64 0
ret <2 x i64> %r
}

define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1