Skip to content

Commit 0752759

Browse files
authored
[AggressiveInstCombine] Support store merge with non-consecutive parts (#149807)
This is a minor extension of #147540, resolving one of the FIXMEs. If the collected parts contain some non-consecutive elements, we can still handle smaller ranges that *are* consecutive. This is not common in practice and mostly shows up when the same value is stored at two different offsets.
1 parent 8c14d3f commit 0752759

File tree

2 files changed

+138
-21
lines changed

2 files changed

+138
-21
lines changed

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -886,35 +886,20 @@ static std::optional<PartStore> matchPartStore(Instruction &I,
886886
return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
887887
}
888888

889-
static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
890-
const DataLayout &DL, TargetTransformInfo &TTI) {
889+
static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
890+
unsigned Width, const DataLayout &DL,
891+
TargetTransformInfo &TTI) {
891892
if (Parts.size() < 2)
892893
return false;
893894

894-
// We now have multiple parts of the same value stored to the same pointer.
895-
// Sort the parts by pointer offset, and make sure they are consistent with
896-
// the value offsets. Also check that the value is fully covered without
897-
// overlaps.
898-
// FIXME: We could support merging stores for only part of the value here.
899-
llvm::sort(Parts);
900-
int64_t LastEndOffsetFromFirst = 0;
901-
const PartStore &First = Parts[0];
902-
for (const PartStore &Part : Parts) {
903-
APInt PtrOffsetFromFirst = Part.PtrOffset - First.PtrOffset;
904-
int64_t ValOffsetFromFirst = Part.ValOffset - First.ValOffset;
905-
if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
906-
LastEndOffsetFromFirst != ValOffsetFromFirst)
907-
return false;
908-
LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
909-
}
910-
911895
// Check whether combining the stores is profitable.
912896
// FIXME: We could generate smaller stores if we can't produce a large one.
897+
const PartStore &First = Parts.front();
913898
LLVMContext &Ctx = First.Store->getContext();
914-
Type *NewTy = Type::getIntNTy(Ctx, LastEndOffsetFromFirst);
899+
Type *NewTy = Type::getIntNTy(Ctx, Width);
915900
unsigned Fast = 0;
916901
if (!TTI.isTypeLegal(NewTy) ||
917-
!TTI.allowsMisalignedMemoryAccesses(Ctx, LastEndOffsetFromFirst,
902+
!TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
918903
First.Store->getPointerAddressSpace(),
919904
First.Store->getAlign(), &Fast) ||
920905
!Fast)
@@ -941,6 +926,39 @@ static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
941926
return true;
942927
}
943928

929+
/// Merge runs of consecutive part stores found in \p Parts. \p Parts is
/// sorted in place; every maximal run whose pointer offsets and value offsets
/// advance together without gaps or overlaps is handed to
/// mergeConsecutivePartStores. Returns true if any of those calls returns true.
static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
930+
const DataLayout &DL, TargetTransformInfo &TTI) {
931+
if (Parts.size() < 2)
932+
return false;
933+
934+
// We now have multiple parts of the same value stored to the same pointer.
935+
// Sort the parts by pointer offset, and make sure they are consistent with
936+
// the value offsets. A part that does not directly continue the current run
937+
// ends it, and each consecutive run is then merged on its own.
938+
bool Changed = false;
939+
llvm::sort(Parts);
940+
int64_t LastEndOffsetFromFirst = 0;
941+
const PartStore *First = &Parts[0];
942+
for (const PartStore &Part : Parts) {
943+
APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
944+
int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
945+
if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
946+
LastEndOffsetFromFirst != ValOffsetFromFirst) {
947+
// Flush the run [First, Part) and start a new run beginning at Part.
Changed |= mergeConsecutivePartStores(ArrayRef(First, &Part),
948+
LastEndOffsetFromFirst, DL, TTI);
949+
First = &Part;
950+
LastEndOffsetFromFirst = Part.ValWidth;
951+
continue;
952+
}
953+
954+
LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
955+
}
956+
957+
// Flush the trailing run that the loop left open.
Changed |= mergeConsecutivePartStores(ArrayRef(First, Parts.end()),
958+
LastEndOffsetFromFirst, DL, TTI);
959+
return Changed;
960+
}
961+
944962
static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
945963
TargetTransformInfo &TTI, AliasAnalysis &AA) {
946964
// FIXME: Add big endian support.

llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,105 @@ define void @test_i32_tbaa(i32 %x, ptr %p) {
792792
ret void
793793
}
794794

795+
; Bytes 0 and 1 of %x go to offsets 0 and 1, byte 3 goes to offset 3, and
; nothing is stored at offset 2. The expected output merges the first two
; stores into one i16 store and keeps the offset-3 store as an i8 store.
define void @test_multiple_parts_with_gap1(i32 %x, ptr %p) {
796+
; CHECK-LABEL: define void @test_multiple_parts_with_gap1(
797+
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
798+
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
799+
; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
800+
; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
801+
; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
802+
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
803+
; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1
804+
; CHECK-NEXT: ret void
805+
;
806+
%x.0 = trunc i32 %x to i8
807+
store i8 %x.0, ptr %p
808+
%shr.1 = lshr i32 %x, 8
809+
%x.1 = trunc i32 %shr.1 to i8
810+
%gep.1 = getelementptr i8, ptr %p, i64 1
811+
store i8 %x.1, ptr %gep.1
812+
%shr.3 = lshr i32 %x, 24
813+
%x.3 = trunc i32 %shr.3 to i8
814+
%gep.3 = getelementptr i8, ptr %p, i64 3
815+
store i8 %x.3, ptr %gep.3
816+
ret void
817+
}
818+
819+
; The pointer offsets 0, 1 and 2 are contiguous, but the stored bytes of %x
; are 0, 2 and 3 (byte 1 is skipped). The expected output keeps the byte-0
; store as is and merges the other two into one i16 store at offset 1.
define void @test_multiple_parts_with_gap2(i32 %x, ptr %p) {
820+
; CHECK-LABEL: define void @test_multiple_parts_with_gap2(
821+
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
822+
; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
823+
; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
824+
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
825+
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 16
826+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
827+
; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1
828+
; CHECK-NEXT: ret void
829+
;
830+
%x.0 = trunc i32 %x to i8
831+
store i8 %x.0, ptr %p
832+
%shr.2 = lshr i32 %x, 16
833+
%x.2 = trunc i32 %shr.2 to i8
834+
%gep.2 = getelementptr i8, ptr %p, i64 1
835+
store i8 %x.2, ptr %gep.2
836+
%shr.3 = lshr i32 %x, 24
837+
%x.3 = trunc i32 %shr.3 to i8
838+
%gep.3 = getelementptr i8, ptr %p, i64 2
839+
store i8 %x.3, ptr %gep.3
840+
ret void
841+
}
842+
843+
; Bytes 0 and 1 of %x go to offsets 0 and 1, and bytes 3 and 4 go to offsets
; 3 and 4; offset 2 is not written. The expected output has two separate i16
; stores, one for each pair.
define void @test_multiple_parts_with_gap3(i64 %x, ptr %p) {
844+
; CHECK-LABEL: define void @test_multiple_parts_with_gap3(
845+
; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) {
846+
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X]] to i16
847+
; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
848+
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
849+
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[X]], 24
850+
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i16
851+
; CHECK-NEXT: store i16 [[TMP3]], ptr [[GEP_3]], align 1
852+
; CHECK-NEXT: ret void
853+
;
854+
%x.0 = trunc i64 %x to i8
855+
store i8 %x.0, ptr %p
856+
%shr.1 = lshr i64 %x, 8
857+
%x.1 = trunc i64 %shr.1 to i8
858+
%gep.1 = getelementptr i8, ptr %p, i64 1
859+
store i8 %x.1, ptr %gep.1
860+
%shr.3 = lshr i64 %x, 24
861+
%x.3 = trunc i64 %shr.3 to i8
862+
%gep.3 = getelementptr i8, ptr %p, i64 3
863+
store i8 %x.3, ptr %gep.3
864+
%shr.4 = lshr i64 %x, 32
865+
%x.4 = trunc i64 %shr.4 to i8
866+
%gep.4 = getelementptr i8, ptr %p, i64 4
867+
store i8 %x.4, ptr %gep.4
868+
ret void
869+
}
870+
871+
; The same two bytes (%x.0 and %x.1) are stored at offsets 0-1 and again at
; offsets 2-3. The expected output turns each pair into its own i16 store of
; the low 16 bits of %x.
define void @test_store_same_parts_twice(i32 %x, ptr %p) {
872+
; CHECK-LABEL: define void @test_store_same_parts_twice(
873+
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
874+
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
875+
; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
876+
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
877+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[X]] to i16
878+
; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1
879+
; CHECK-NEXT: ret void
880+
;
881+
%x.0 = trunc i32 %x to i8
882+
store i8 %x.0, ptr %p
883+
%shr.1 = lshr i32 %x, 8
884+
%x.1 = trunc i32 %shr.1 to i8
885+
%gep.1 = getelementptr i8, ptr %p, i64 1
886+
store i8 %x.1, ptr %gep.1
887+
%gep.2 = getelementptr i8, ptr %p, i64 2
888+
store i8 %x.0, ptr %gep.2
889+
%gep.3 = getelementptr i8, ptr %p, i64 3
890+
store i8 %x.1, ptr %gep.3
891+
ret void
892+
}
893+
795894
!0 = !{!1}
796895
!1 = !{!1, !2}
797896
!2 = !{!2}

0 commit comments

Comments
 (0)