diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index a3a0e31f887ab..7fa6e6c5161cf 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -886,35 +886,20 @@ static std::optional<PartStore> matchPartStore(Instruction &I,
   return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
 }
 
-static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
-                            const DataLayout &DL, TargetTransformInfo &TTI) {
+static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
+                                       unsigned Width, const DataLayout &DL,
+                                       TargetTransformInfo &TTI) {
   if (Parts.size() < 2)
     return false;
 
-  // We now have multiple parts of the same value stored to the same pointer.
-  // Sort the parts by pointer offset, and make sure they are consistent with
-  // the value offsets. Also check that the value is fully covered without
-  // overlaps.
-  // FIXME: We could support merging stores for only part of the value here.
-  llvm::sort(Parts);
-  int64_t LastEndOffsetFromFirst = 0;
-  const PartStore &First = Parts[0];
-  for (const PartStore &Part : Parts) {
-    APInt PtrOffsetFromFirst = Part.PtrOffset - First.PtrOffset;
-    int64_t ValOffsetFromFirst = Part.ValOffset - First.ValOffset;
-    if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
-        LastEndOffsetFromFirst != ValOffsetFromFirst)
-      return false;
-    LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
-  }
-
   // Check whether combining the stores is profitable.
   // FIXME: We could generate smaller stores if we can't produce a large one.
+  const PartStore &First = Parts.front();
   LLVMContext &Ctx = First.Store->getContext();
-  Type *NewTy = Type::getIntNTy(Ctx, LastEndOffsetFromFirst);
+  Type *NewTy = Type::getIntNTy(Ctx, Width);
   unsigned Fast = 0;
   if (!TTI.isTypeLegal(NewTy) ||
-      !TTI.allowsMisalignedMemoryAccesses(Ctx, LastEndOffsetFromFirst,
+      !TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
                                           First.Store->getPointerAddressSpace(),
                                           First.Store->getAlign(), &Fast) ||
       !Fast)
@@ -941,6 +926,39 @@ static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
   return true;
 }
 
+static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
+                            const DataLayout &DL, TargetTransformInfo &TTI) {
+  if (Parts.size() < 2)
+    return false;
+
+  // We now have multiple parts of the same value stored to the same pointer.
+  // Sort the parts by pointer offset, and make sure they are consistent with
+  // the value offsets. Also check that the value is fully covered without
+  // overlaps.
+  bool Changed = false;
+  llvm::sort(Parts);
+  int64_t LastEndOffsetFromFirst = 0;
+  const PartStore *First = &Parts[0];
+  for (const PartStore &Part : Parts) {
+    APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
+    int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
+    if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
+        LastEndOffsetFromFirst != ValOffsetFromFirst) {
+      Changed |= mergeConsecutivePartStores(ArrayRef(First, &Part),
+                                            LastEndOffsetFromFirst, DL, TTI);
+      First = &Part;
+      LastEndOffsetFromFirst = Part.ValWidth;
+      continue;
+    }
+
+    LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
+  }
+
+  Changed |= mergeConsecutivePartStores(ArrayRef(First, Parts.end()),
+                                        LastEndOffsetFromFirst, DL, TTI);
+  return Changed;
+}
+
 static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
                                   TargetTransformInfo &TTI, AliasAnalysis &AA) {
   // FIXME: Add big endian support.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
index 38a55e1566a77..4ab8d18eb69b5 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
@@ -792,6 +792,105 @@ define void @test_i32_tbaa(i32 %x, ptr %p) {
   ret void
 }
 
+define void @test_multiple_parts_with_gap1(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap1(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT:    store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT:    [[SHR_3:%.*]] = lshr i32 [[X]], 24
+; CHECK-NEXT:    [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
+; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT:    store i8 [[X_3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT:    ret void
+;
+  %x.0 = trunc i32 %x to i8
+  store i8 %x.0, ptr %p
+  %shr.1 = lshr i32 %x, 8
+  %x.1 = trunc i32 %shr.1 to i8
+  %gep.1 = getelementptr i8, ptr %p, i64 1
+  store i8 %x.1, ptr %gep.1
+  %shr.3 = lshr i32 %x, 24
+  %x.3 = trunc i32 %shr.3 to i8
+  %gep.3 = getelementptr i8, ptr %p, i64 3
+  store i8 %x.3, ptr %gep.3
+  ret void
+}
+
+define void @test_multiple_parts_with_gap2(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap2(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_0:%.*]] = trunc i32 [[X]] to i8
+; CHECK-NEXT:    store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    store i16 [[TMP2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT:    ret void
+;
+  %x.0 = trunc i32 %x to i8
+  store i8 %x.0, ptr %p
+  %shr.2 = lshr i32 %x, 16
+  %x.2 = trunc i32 %shr.2 to i8
+  %gep.2 = getelementptr i8, ptr %p, i64 1
+  store i8 %x.2, ptr %gep.2
+  %shr.3 = lshr i32 %x, 24
+  %x.3 = trunc i32 %shr.3 to i8
+  %gep.3 = getelementptr i8, ptr %p, i64 2
+  store i8 %x.3, ptr %gep.3
+  ret void
+}
+
+define void @test_multiple_parts_with_gap3(i64 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap3(
+; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[X]] to i16
+; CHECK-NEXT:    store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[X]], 24
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i16
+; CHECK-NEXT:    store i16 [[TMP3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT:    ret void
+;
+  %x.0 = trunc i64 %x to i8
+  store i8 %x.0, ptr %p
+  %shr.1 = lshr i64 %x, 8
+  %x.1 = trunc i64 %shr.1 to i8
+  %gep.1 = getelementptr i8, ptr %p, i64 1
+  store i8 %x.1, ptr %gep.1
+  %shr.3 = lshr i64 %x, 24
+  %x.3 = trunc i64 %shr.3 to i8
+  %gep.3 = getelementptr i8, ptr %p, i64 3
+  store i8 %x.3, ptr %gep.3
+  %shr.4 = lshr i64 %x, 32
+  %x.4 = trunc i64 %shr.4 to i8
+  %gep.4 = getelementptr i8, ptr %p, i64 4
+  store i8 %x.4, ptr %gep.4
+  ret void
+}
+
+define void @test_store_same_parts_twice(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_store_same_parts_twice(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT:    store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT:    store i16 [[TMP2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT:    ret void
+;
+  %x.0 = trunc i32 %x to i8
+  store i8 %x.0, ptr %p
+  %shr.1 = lshr i32 %x, 8
+  %x.1 = trunc i32 %shr.1 to i8
+  %gep.1 = getelementptr i8, ptr %p, i64 1
+  store i8 %x.1, ptr %gep.1
+  %gep.2 = getelementptr i8, ptr %p, i64 2
+  store i8 %x.0, ptr %gep.2
+  %gep.3 = getelementptr i8, ptr %p, i64 3
+  store i8 %x.1, ptr %gep.3
+  ret void
+}
+
 !0 = !{!1}
 !1 = !{!1, !2}
 !2 = !{!2}