[AggressiveInstCombine] Support store merge with non-consecutive parts #149807
Conversation
This is a minor extension of llvm#147540, resolving one of the FIXMEs. If the collected parts contain some non-consecutive elements, we can still handle smaller ranges that *are* consecutive. This is not common in practice and mostly shows up when the same value is stored at two different offsets.

llvm-opt-benchmark results: dtcxzyw/llvm-opt-benchmark#2593
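As an illustration, here is a condensed sketch mirroring the new test_multiple_parts_with_gap1 test in the diff below (the function names here are hypothetical). Bytes 0 and 1 of %x form a consecutive range and are merged into a single i16 store, while the isolated store of byte 3 is left alone:

; Before: i8 stores of bytes 0, 1 and 3 of %x (byte 2 is never stored).
define void @gap_example(i32 %x, ptr %p) {
  %x.0 = trunc i32 %x to i8
  store i8 %x.0, ptr %p
  %shr.1 = lshr i32 %x, 8
  %x.1 = trunc i32 %shr.1 to i8
  %gep.1 = getelementptr i8, ptr %p, i64 1
  store i8 %x.1, ptr %gep.1
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  %gep.3 = getelementptr i8, ptr %p, i64 3
  store i8 %x.3, ptr %gep.3
  ret void
}

; After: the consecutive bytes 0-1 become one i16 store; byte 3 stays an i8 store.
define void @gap_example_merged(i32 %x, ptr %p) {
  %lo = trunc i32 %x to i16
  store i16 %lo, ptr %p, align 1
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  %gep.3 = getelementptr i8, ptr %p, i64 3
  store i8 %x.3, ptr %gep.3
  ret void
}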
@llvm/pr-subscribers-llvm-transforms

Author: Nikita Popov (nikic)

Full diff: https://github.com/llvm/llvm-project/pull/149807.diff

2 Files Affected:
- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
- llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index a3a0e31f887ab..7fa6e6c5161cf 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -886,35 +886,20 @@ static std::optional<PartStore> matchPartStore(Instruction &I,
return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
}
-static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
- const DataLayout &DL, TargetTransformInfo &TTI) {
+static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
+ unsigned Width, const DataLayout &DL,
+ TargetTransformInfo &TTI) {
if (Parts.size() < 2)
return false;
- // We now have multiple parts of the same value stored to the same pointer.
- // Sort the parts by pointer offset, and make sure they are consistent with
- // the value offsets. Also check that the value is fully covered without
- // overlaps.
- // FIXME: We could support merging stores for only part of the value here.
- llvm::sort(Parts);
- int64_t LastEndOffsetFromFirst = 0;
- const PartStore &First = Parts[0];
- for (const PartStore &Part : Parts) {
- APInt PtrOffsetFromFirst = Part.PtrOffset - First.PtrOffset;
- int64_t ValOffsetFromFirst = Part.ValOffset - First.ValOffset;
- if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
- LastEndOffsetFromFirst != ValOffsetFromFirst)
- return false;
- LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
- }
-
// Check whether combining the stores is profitable.
// FIXME: We could generate smaller stores if we can't produce a large one.
+ const PartStore &First = Parts.front();
LLVMContext &Ctx = First.Store->getContext();
- Type *NewTy = Type::getIntNTy(Ctx, LastEndOffsetFromFirst);
+ Type *NewTy = Type::getIntNTy(Ctx, Width);
unsigned Fast = 0;
if (!TTI.isTypeLegal(NewTy) ||
- !TTI.allowsMisalignedMemoryAccesses(Ctx, LastEndOffsetFromFirst,
+ !TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
First.Store->getPointerAddressSpace(),
First.Store->getAlign(), &Fast) ||
!Fast)
@@ -941,6 +926,39 @@ static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
return true;
}
+static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
+ const DataLayout &DL, TargetTransformInfo &TTI) {
+ if (Parts.size() < 2)
+ return false;
+
+ // We now have multiple parts of the same value stored to the same pointer.
+ // Sort the parts by pointer offset, and make sure they are consistent with
+ // the value offsets. Also check that the value is fully covered without
+ // overlaps.
+ bool Changed = false;
+ llvm::sort(Parts);
+ int64_t LastEndOffsetFromFirst = 0;
+ const PartStore *First = &Parts[0];
+ for (const PartStore &Part : Parts) {
+ APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
+ int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
+ if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
+ LastEndOffsetFromFirst != ValOffsetFromFirst) {
+ Changed |= mergeConsecutivePartStores(ArrayRef(First, &Part),
+ LastEndOffsetFromFirst, DL, TTI);
+ First = &Part;
+ LastEndOffsetFromFirst = Part.ValWidth;
+ continue;
+ }
+
+ LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
+ }
+
+ Changed |= mergeConsecutivePartStores(ArrayRef(First, Parts.end()),
+ LastEndOffsetFromFirst, DL, TTI);
+ return Changed;
+}
+
static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
TargetTransformInfo &TTI, AliasAnalysis &AA) {
// FIXME: Add big endian support.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
index 38a55e1566a77..4ab8d18eb69b5 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
@@ -792,6 +792,105 @@ define void @test_i32_tbaa(i32 %x, ptr %p) {
ret void
}
+define void @test_multiple_parts_with_gap1(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap1(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
+; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_multiple_parts_with_gap2(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap2(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_multiple_parts_with_gap3(i64 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap3(
+; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[X]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i16
+; CHECK-NEXT: store i16 [[TMP3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i64 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i64 %x, 8
+ %x.1 = trunc i64 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.3 = lshr i64 %x, 24
+ %x.3 = trunc i64 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ %shr.4 = lshr i64 %x, 32
+ %x.4 = trunc i64 %shr.4 to i8
+ %gep.4 = getelementptr i8, ptr %p, i64 4
+ store i8 %x.4, ptr %gep.4
+ ret void
+}
+
+define void @test_store_same_parts_twice(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_store_same_parts_twice(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.0, ptr %gep.2
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.1, ptr %gep.3
+ ret void
+}
+
!0 = !{!1}
!1 = !{!1, !2}
!2 = !{!2}
LG.
However, I have a question about the remaining FIXME:
FIXME: We could generate smaller stores if we can't produce a large one.
Imagine we have three consecutive stores:
store 0 | store 1 | store 2
        ^
        |
        aligned to 2 bytes
Fast unaligned memory access may not be as fast as aligned access, because the former may touch multiple cache lines. In this case, store i8 + store i16 is better than store i16 + store i8. So I don't think a greedy strategy like the one in this patch works well.
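To make the concern concrete, here is a hypothetical sketch (the function names are mine, and it assumes %p itself is only 1-byte aligned while %p + 1 happens to be 2-byte aligned). Merging greedily from offset 0 produces a misaligned i16 plus an i8, whereas splitting at the alignment boundary keeps the i16 aligned:

; Greedy merge from offset 0: the i16 is misaligned and may straddle a cache line.
define void @greedy_split(i32 %x, ptr %p) {
  %lo = trunc i32 %x to i16
  store i16 %lo, ptr %p, align 1
  %shr.2 = lshr i32 %x, 16
  %x.2 = trunc i32 %shr.2 to i8
  %gep.2 = getelementptr i8, ptr %p, i64 2
  store i8 %x.2, ptr %gep.2
  ret void
}

; Alignment-aware split: keep byte 0 as an i8 and store bytes 1-2 as an aligned i16.
define void @aligned_split(i32 %x, ptr %p) {
  %x.0 = trunc i32 %x to i8
  store i8 %x.0, ptr %p
  %shr.1 = lshr i32 %x, 8
  %hi = trunc i32 %shr.1 to i16
  %gep.1 = getelementptr i8, ptr %p, i64 1
  store i16 %hi, ptr %gep.1, align 2
  ret void
}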
I think this can be mostly addressed by only combining stores, but not trying to split them up in new ways. So if we have stores i8, i16 we would not try to rewrite them to i16, i8. This does mean we'd also exclude rewriting i8, i16, i8 to i16, i16 (for the hypothetical case where an unaligned i32 store is not supported).
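A short hypothetical sketch of the case that would be excluded under that policy: the i8/i16/i8 sequence below stays as-is, because re-slicing it into two i16 stores (at offsets 0 and 2) would split the existing i16 store rather than merely combining stores:

; i8 at offset 0, i16 at offsets 1-2, i8 at offset 3. A combine-only policy
; leaves this alone instead of rewriting it to i16 + i16.
define void @no_resplit(i32 %x, ptr %p) {
  %x.0 = trunc i32 %x to i8
  store i8 %x.0, ptr %p
  %shr.1 = lshr i32 %x, 8
  %mid = trunc i32 %shr.1 to i16
  %gep.1 = getelementptr i8, ptr %p, i64 1
  store i16 %mid, ptr %gep.1, align 1
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  %gep.3 = getelementptr i8, ptr %p, i64 3
  store i8 %x.3, ptr %gep.3
  ret void
}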