
[AggressiveInstCombine] Implement store merge optimization #147540

Status: Open · wants to merge 2 commits into main
135 changes: 135 additions & 0 deletions llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -842,6 +842,138 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
return true;
}

/// ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
struct PartStore {
Value *PtrBase;
APInt PtrOffset;
Value *Val;
uint64_t ValOffset;
uint64_t ValWidth;
StoreInst *Store;

bool isCompatibleWith(const PartStore &Other) const {
return PtrBase == Other.PtrBase && Val == Other.Val;
Review comment (Member):

TODO: We can merge two stores with different constant value operands.
See https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2560/files#r2192918824
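
To make the unhandled case concrete, a hypothetical IR sketch (illustrative values, not part of this patch) of two adjacent constant stores that could be combined:

  store i8 1, ptr %p
  %gep = getelementptr i8, ptr %p, i64 1
  store i8 2, ptr %gep
  ; On a little-endian target this could become: store i16 513, ptr %p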

}

bool operator<(const PartStore &Other) const {
return PtrOffset.slt(Other.PtrOffset);
}
};

static std::optional<PartStore> matchPartStore(Instruction &I,
const DataLayout &DL) {
auto *Store = dyn_cast<StoreInst>(&I);
if (!Store || !Store->isSimple())
return std::nullopt;

Value *StoredVal = Store->getValueOperand();
Type *StoredTy = StoredVal->getType();
if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(StoredTy))
return std::nullopt;

uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
uint64_t ValOffset = 0;
Value *Val;
if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val),
m_ConstantInt(ValOffset))),
m_Trunc(m_Value(Val)))))
return std::nullopt;

Value *Ptr = Store->getPointerOperand();
APInt PtrOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
Value *PtrBase = Ptr->stripAndAccumulateConstantOffsets(
DL, PtrOffset, /*AllowNonInbounds=*/true);
return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
}
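
For orientation, a minimal sketch (illustrative names, assuming an i32 source value %x) of the two store shapes matchPartStore accepts, and the fields it would record for each:

  %lo = trunc i32 %x to i8                ; plain trunc: ValOffset = 0, ValWidth = 8
  store i8 %lo, ptr %p                    ; PtrBase = %p, PtrOffset = 0
  %shr = lshr i32 %x, 8
  %hi = trunc i32 %shr to i8              ; trunc of lshr: ValOffset = 8, ValWidth = 8
  %gep = getelementptr i8, ptr %p, i64 1
  store i8 %hi, ptr %gep                  ; PtrBase = %p, PtrOffset = 1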

static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
const DataLayout &DL, TargetTransformInfo &TTI) {
if (Parts.size() < 2)
return false;

// We now have multiple parts of the same value stored to the same pointer.
// Sort the parts by pointer offset, and make sure they are consistent with
// the value offsets. Also check that the value is fully covered without
// overlaps.
// FIXME: We could support merging stores for only part of the value here.
llvm::sort(Parts);
int64_t LastEndOffsetFromFirst = 0;
const PartStore &First = Parts[0];
for (const PartStore &Part : Parts) {
APInt PtrOffsetFromFirst = Part.PtrOffset - First.PtrOffset;
int64_t ValOffsetFromFirst = Part.ValOffset - First.ValOffset;
if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
LastEndOffsetFromFirst != ValOffsetFromFirst)
return false;
LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
}

// Check whether combining the stores is profitable.
// FIXME: We could generate smaller stores if we can't produce a large one.
LLVMContext &Ctx = First.Store->getContext();
Type *NewTy = Type::getIntNTy(Ctx, LastEndOffsetFromFirst);
unsigned Fast = 0;
if (!TTI.isTypeLegal(NewTy) ||
!TTI.allowsMisalignedMemoryAccesses(Ctx, LastEndOffsetFromFirst,
First.Store->getPointerAddressSpace(),
First.Store->getAlign(), &Fast) ||
!Fast)
return false;

// Generate the combined store.
IRBuilder<> Builder(First.Store);
Value *Val = First.Val;
if (First.ValOffset != 0)
Val = Builder.CreateLShr(Val, First.ValOffset);
Val = Builder.CreateTrunc(Val, NewTy);
StoreInst *Store = Builder.CreateAlignedStore(
Val, First.Store->getPointerOperand(), First.Store->getAlign());

AAMDNodes AATags = First.Store->getAAMetadata();
for (const PartStore &Part : drop_begin(Parts))
AATags = AATags.concat(Part.Store->getAAMetadata());
Store->setAAMetadata(AATags);

// Remove the old stores.
for (const PartStore &Part : Parts)
Part.Store->eraseFromParent();

return true;
}
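
As a sketch of the intended rewrite (illustrative names, assuming a little-endian target where an unaligned i16 access is legal and fast), merging the low and high byte stores of an i16 value:

  ; Before:
  %x.0 = trunc i16 %x to i8
  store i8 %x.0, ptr %p
  %shr = lshr i16 %x, 8
  %x.1 = trunc i16 %shr to i8
  %gep = getelementptr i8, ptr %p, i64 1
  store i8 %x.1, ptr %gep
  ; After (First.ValOffset is 0, so no lshr is emitted, and the trunc to i16 folds away):
  store i16 %x, ptr %p, align 1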

static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
TargetTransformInfo &TTI, AliasAnalysis &AA) {
// FIXME: Add big endian support.
if (DL.isBigEndian())
return false;

SmallVector<PartStore, 8> Parts;
bool MadeChange = false;
for (Instruction &I : make_early_inc_range(BB)) {
if (std::optional<PartStore> Part = matchPartStore(I, DL)) {
if (Parts.empty() || Part->isCompatibleWith(Parts[0])) {
Parts.push_back(std::move(*Part));
continue;
}

MadeChange |= mergePartStores(Parts, DL, TTI);
Parts.clear();
Review comment (Member):

Please add some tests for multiple store groups:

define void @test_multi_group(i16 %x, ptr %p1, i16 %y, ptr %p2) {
  %x.0 = trunc i16 %x to i8
  store i8 %x.0, ptr %p1
  %shr.1 = lshr i16 %x, 8
  %x.1 = trunc i16 %shr.1 to i8
  %gep.1 = getelementptr i8, ptr %p1, i64 1
  store i8 %x.1, ptr %gep.1
  call void @may_unwind()
  %y.0 = trunc i16 %y to i8
  store i8 %y.0, ptr %p2
  %shr.2 = lshr i16 %y, 8
  %y.1 = trunc i16 %shr.2 to i8
  %gep.2 = getelementptr i8, ptr %p2, i64 1
  store i8 %y.1, ptr %gep.2
  ret void
}

Parts.push_back(std::move(*Part));
continue;
}

// FIXME: Use AA to make this more precise.
if (I.mayReadOrWriteMemory() || I.mayThrow()) {
MadeChange |= mergePartStores(Parts, DL, TTI);
Parts.clear();
continue;
}
}

MadeChange |= mergePartStores(Parts, DL, TTI);
return MadeChange;
}

/// Combine away instructions providing they are still equivalent when compared
/// against 0, i.e. do they have any bits set.
static Value *optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder) {
@@ -1330,6 +1462,9 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
// bugs.
MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
}

// Do this separately to avoid redundantly scanning stores multiple times.
MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA);
}

// We're done with transforms, so remove dead instructions.
106 changes: 106 additions & 0 deletions llvm/test/Transforms/AggressiveInstCombine/X86/store-merge-be.ll
@@ -0,0 +1,106 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu -data-layout="E-n64" < %s | FileCheck %s

; Pretend X86 is big endian.

; FIXME: Big endian not supported yet.

define void @test_i32_be(i32 %x, ptr %p) {
; CHECK-LABEL: define void @test_i32_be(
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3
; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1
; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8
; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 2
; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16
; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 24
; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[TMP1]] to i8
; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1
; CHECK-NEXT: ret void
;
%x.0 = trunc i32 %x to i8
%gep.0 = getelementptr i8, ptr %p, i64 3
store i8 %x.0, ptr %gep.0
%shr.1 = lshr i32 %x, 8
%x.1 = trunc i32 %shr.1 to i8
%gep.1 = getelementptr i8, ptr %p, i64 2
store i8 %x.1, ptr %gep.1
%shr.2 = lshr i32 %x, 16
%x.2 = trunc i32 %shr.2 to i8
%gep.2 = getelementptr i8, ptr %p, i64 1
store i8 %x.2, ptr %gep.2
%shr.3 = lshr i32 %x, 24
%x.3 = trunc i32 %shr.3 to i8
store i8 %x.3, ptr %p
ret void
}

define void @test_i32_le(i32 %x, ptr %p) {
; CHECK-LABEL: define void @test_i32_le(
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8
; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i8
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
; CHECK-NEXT: store i8 [[X_1]], ptr [[GEP_1]], align 1
; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16
; CHECK-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
; CHECK-NEXT: store i8 [[X_2]], ptr [[GEP_2]], align 1
; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1
; CHECK-NEXT: ret void
;
%x.0 = trunc i32 %x to i8
store i8 %x.0, ptr %p
%shr.1 = lshr i32 %x, 8
%x.1 = trunc i32 %shr.1 to i8
%gep.1 = getelementptr i8, ptr %p, i64 1
store i8 %x.1, ptr %gep.1
%shr.2 = lshr i32 %x, 16
%x.2 = trunc i32 %shr.2 to i8
%gep.2 = getelementptr i8, ptr %p, i64 2
store i8 %x.2, ptr %gep.2
%shr.3 = lshr i32 %x, 24
%x.3 = trunc i32 %shr.3 to i8
%gep.3 = getelementptr i8, ptr %p, i64 3
store i8 %x.3, ptr %gep.3
ret void
}

define void @test_i32_mixed_parts(i32 %x, ptr %p) {
; CHECK-LABEL: define void @test_i32_mixed_parts(
; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[P]], i64 3
; CHECK-NEXT: store i8 [[X_0]], ptr [[GEP_0]], align 1
; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8
; CHECK-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to i16
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[P]], i64 1
; CHECK-NEXT: store i16 [[X_1]], ptr [[GEP_1]], align 2
; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
; CHECK-NEXT: store i8 [[X_3]], ptr [[P]], align 1
; CHECK-NEXT: ret void
;
%x.0 = trunc i32 %x to i8
%gep.0 = getelementptr i8, ptr %p, i64 3
store i8 %x.0, ptr %gep.0
%shr.1 = lshr i32 %x, 8
%x.1 = trunc i32 %shr.1 to i16
%gep.1 = getelementptr i8, ptr %p, i64 1
store i16 %x.1, ptr %gep.1
%shr.3 = lshr i32 %x, 24
%x.3 = trunc i32 %shr.3 to i8
store i8 %x.3, ptr %p
ret void
}