
[SelectionDAG] Fix and improve TargetLowering::SimplifySetCC #87646


Merged: 1 commit, Apr 12, 2024
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1804,6 +1804,13 @@ class TargetLoweringBase {
/// where the sext is redundant, and use x directly.
virtual bool shouldRemoveRedundantExtend(SDValue Op) const { return true; }

/// Indicates if any padding is guaranteed to go at the most significant bits
/// when storing the type to memory and the type size isn't equal to the store
/// size.
bool isPaddedAtMostSignificantBitsWhenStored(EVT VT) const {
return VT.isScalarInteger() && !VT.isByteSized();
}

/// When splitting a value of the specified type into parts, does the Lo
/// or Hi part come first? This usually follows the endianness, except
/// for ppcf128, where the Hi part always comes first.
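For illustration of the new isPaddedAtMostSignificantBitsWhenStored() hook, a minimal standalone sketch (plain C++ modeling the predicate on bit widths, not actual LLVM code; the helper names are made up for the example): a scalar integer type whose bit width is not a multiple of 8 is stored in a wider byte-sized container, and the hook states that the extra container bits are guaranteed to be padding at the most significant end. On big-endian targets that places the low-order bytes of the value at the end of the stored object, which is what the big-endian test updates below reflect.

  #include <cassert>

  // Mirrors isPaddedAtMostSignificantBitsWhenStored() for a scalar integer
  // of the given bit width (EVT::isByteSized() <=> bits % 8 == 0).
  static bool paddedAtMSBWhenStored(unsigned bits) { return bits % 8 != 0; }

  // Mirrors EVT::getStoreSizeInBits(): the size rounded up to whole bytes.
  static unsigned storeSizeInBits(unsigned bits) { return ((bits + 7) / 8) * 8; }

  int main() {
    assert(!paddedAtMSBWhenStored(32));  // i32: byte-sized, no padding
    assert(paddedAtMSBWhenStored(33));   // i33: 7 bits of MSB padding,
    assert(storeSizeInBits(33) == 40);   //      stored in 5 bytes
    assert(paddedAtMSBWhenStored(129));  // i129: 7 bits of MSB padding,
    assert(storeSizeInBits(129) == 136); //       stored in 17 bytes
  }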
68 changes: 41 additions & 27 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4621,48 +4621,62 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
APInt bestMask;
unsigned bestWidth = 0, bestOffset = 0;
if (Lod->isSimple() && Lod->isUnindexed()) {
if (Lod->isSimple() && Lod->isUnindexed() &&
(Lod->getMemoryVT().isByteSized() ||
isPaddedAtMostSignificantBitsWhenStored(Lod->getMemoryVT()))) {
unsigned memWidth = Lod->getMemoryVT().getStoreSizeInBits();
unsigned origWidth = N0.getValueSizeInBits();
unsigned maskWidth = origWidth;
// We can narrow (e.g.) 16-bit extending loads on 32-bit target to
// 8 bits, but have to be careful...
if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
origWidth = Lod->getMemoryVT().getSizeInBits();
const APInt &Mask = N0.getConstantOperandAPInt(1);
for (unsigned width = origWidth / 2; width>=8; width /= 2) {
// Only consider power-of-2 widths (and at least one byte) as candidates
// for the narrowed load.
for (unsigned width = 8; width < origWidth; width *= 2) {
EVT newVT = EVT::getIntegerVT(*DAG.getContext(), width);
if (!shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT))
continue;
APInt newMask = APInt::getLowBitsSet(maskWidth, width);
for (unsigned offset=0; offset<origWidth/width; offset++) {
// Avoid accessing any padding here for now (we could use memWidth
// instead of origWidth here otherwise).
unsigned maxOffset = origWidth - width;
for (unsigned offset = 0; offset <= maxOffset; offset += 8) {
if (Mask.isSubsetOf(newMask)) {
if (Layout.isLittleEndian())
bestOffset = (uint64_t)offset * (width/8);
else
bestOffset = (origWidth/width - offset - 1) * (width/8);
bestMask = Mask.lshr(offset * (width/8) * 8);
bestWidth = width;
break;
unsigned ptrOffset =
Layout.isLittleEndian() ? offset : memWidth - width - offset;
unsigned IsFast = 0;
Align NewAlign = commonAlignment(Lod->getAlign(), ptrOffset / 8);
if (allowsMemoryAccess(
*DAG.getContext(), Layout, newVT, Lod->getAddressSpace(),
NewAlign, Lod->getMemOperand()->getFlags(), &IsFast) &&
IsFast) {
bestOffset = ptrOffset / 8;
bestMask = Mask.lshr(offset);
bestWidth = width;
break;
}
}
newMask <<= width;
newMask <<= 8;
}
if (bestWidth)
break;
}
}
if (bestWidth) {
EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
if (newVT.isRound() &&
shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
SDValue Ptr = Lod->getBasePtr();
if (bestOffset != 0)
Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(bestOffset),
dl);
SDValue NewLoad =
DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
Lod->getPointerInfo().getWithOffset(bestOffset),
Lod->getOriginalAlign());
return DAG.getSetCC(dl, VT,
DAG.getNode(ISD::AND, dl, newVT, NewLoad,
DAG.getConstant(bestMask.trunc(bestWidth),
dl, newVT)),
DAG.getConstant(0LL, dl, newVT), Cond);
}
SDValue Ptr = Lod->getBasePtr();
if (bestOffset != 0)
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(bestOffset));
SDValue NewLoad =
DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
Lod->getPointerInfo().getWithOffset(bestOffset),
Lod->getOriginalAlign());
SDValue And =
DAG.getNode(ISD::AND, dl, newVT, NewLoad,
DAG.getConstant(bestMask.trunc(bestWidth), dl, newVT));
return DAG.getSetCC(dl, VT, And, DAG.getConstant(0LL, dl, newVT), Cond);
}
}

33 changes: 16 additions & 17 deletions llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll
@@ -40,7 +40,9 @@ define i1 @test_129_15_0(ptr %y) {
;
; CHECK-BE-LABEL: test_129_15_0:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrh r0, [r0, #14]
; CHECK-BE-NEXT: ldr r1, [r0, #12]
Collaborator (author) commented:

This is not a regression. It is supposed to be fixing a miscompile.

The 15 bits masked out with the AND are at [r0, #15] and [r0, #16]. I think the optimization to narrow to a 16-bit load is blocked because loading 16 bits starting at [r0, #15] would be an unaligned load.
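As a side note, a minimal standalone sketch of the alignment arithmetic that blocks the narrowing (plain C++ mirroring llvm::commonAlignment(); the 8-byte base alignment is an assumption for illustration, not taken from the test): the 16-bit candidate load would start at byte offset 15, so its known alignment degrades to 1 byte, and the new code only accepts it if allowsMemoryAccess() reports such an access as both allowed and fast.

  #include <algorithm>
  #include <bit>
  #include <cassert>

  // Mirrors llvm::commonAlignment(Align, Offset): the largest power of two
  // dividing both the base alignment and the byte offset.
  static unsigned commonAlign(unsigned baseAlign, unsigned offsetBytes) {
    if (offsetBytes == 0)
      return baseAlign;
    return 1u << std::min(std::countr_zero(baseAlign),
                          std::countr_zero(offsetBytes));
  }

  int main() {
    assert(commonAlign(8, 15) == 1); // candidate at offset 15: 1-byte aligned
    assert(commonAlign(8, 14) == 2); // a candidate at offset 14 would keep 2
  }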

; CHECK-BE-NEXT: ldrb r0, [r0, #16]
; CHECK-BE-NEXT: orr r0, r0, r1, lsl #8
; CHECK-BE-NEXT: mov r1, #255
; CHECK-BE-NEXT: orr r1, r1, #32512
; CHECK-BE-NEXT: ands r0, r0, r1
@@ -49,7 +51,7 @@ define i1 @test_129_15_0(ptr %y) {
;
; CHECK-V7-BE-LABEL: test_129_15_0:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #14]
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #15]
; CHECK-V7-BE-NEXT: bfc r0, #15, #17
; CHECK-V7-BE-NEXT: cmp r0, #0
; CHECK-V7-BE-NEXT: movwne r0, #1
@@ -119,14 +121,14 @@ define i1 @test_33_8_0(ptr %y) {
;
; CHECK-BE-LABEL: test_33_8_0:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrb r0, [r0, #3]
; CHECK-BE-NEXT: ldrb r0, [r0, #4]
; CHECK-BE-NEXT: cmp r0, #0
; CHECK-BE-NEXT: movne r0, #1
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_33_8_0:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #3]
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #4]
; CHECK-V7-BE-NEXT: cmp r0, #0
; CHECK-V7-BE-NEXT: movwne r0, #1
; CHECK-V7-BE-NEXT: bx lr
@@ -179,13 +181,13 @@ define i1 @test_33_1_31(ptr %y) {
;
; CHECK-BE-LABEL: test_33_1_31:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrb r0, [r0]
; CHECK-BE-NEXT: ldrb r0, [r0, #1]
; CHECK-BE-NEXT: lsr r0, r0, #7
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_33_1_31:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrb r0, [r0]
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #1]
; CHECK-V7-BE-NEXT: lsr r0, r0, #7
; CHECK-V7-BE-NEXT: bx lr
%a = load i33, ptr %y
@@ -209,13 +211,13 @@ define i1 @test_33_1_0(ptr %y) {
;
; CHECK-BE-LABEL: test_33_1_0:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrb r0, [r0, #3]
; CHECK-BE-NEXT: ldrb r0, [r0, #4]
; CHECK-BE-NEXT: and r0, r0, #1
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_33_1_0:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #3]
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #4]
; CHECK-V7-BE-NEXT: and r0, r0, #1
; CHECK-V7-BE-NEXT: bx lr
%a = load i33, ptr %y
@@ -309,7 +311,7 @@ define i1 @test_48_16_8(ptr %y) {
; CHECK-LE-LABEL: test_48_16_8:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldrh r0, [r0, #1]
; CHECK-LE-NEXT: cmp r0, #0
; CHECK-LE-NEXT: lsls r0, r0, #8
Collaborator (author) commented:

I don't know whether lsls or cmp is better here for ARM.

What happens is that after load narrowing we get:

          t17: i32,ch = load<(load (s32) from %ir.y, align 8)> t0, t2, undef:i32
        t19: i32 = and t17, Constant:i32<16776960>
      t21: i1 = setcc t19, Constant:i32<0>, setne:ch

and then the DAG combiner triggers on the AND and changes it into

            t23: i32 = add nuw t2, Constant:i32<1>
          t24: i32,ch = load<(load (s16) from %ir.y + 1, align 1, basealign 8), zext from i16> t0, t23, undef:i32
        t26: i32 = shl t24, Constant:i32<8>
      t21: i1 = setcc t26, Constant:i32<0>, setne:ch

I think the optimization in this patch first avoids introducing a misaligned 16-bit load from %ir.y + 1 and instead uses a 32-bit load. But then some other DAG combine is narrowing the load a second time, resulting in the unaligned load, but also introducing an SHL that isn't needed for the comparison with 0.

Makes me wonder if the 16-bit load with align 1 is a bad thing here?

It also seems like we lack some optimization that removes the redundant SHL :-(
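For reference, a small standalone sketch (plain C++, not a DAG combine) of why the shl is redundant in the compare-with-zero above: the shifted value is zero-extended from i16, so a left shift by 8 cannot move any set bit out of the 32-bit value, and a fold like (setcc (shl X, C), 0, ne) -> (setcc X, 0, ne) would be valid whenever the shifted-out bits are known zero.

  #include <cassert>
  #include <cstdint>

  // For any value that a zext from i16 can produce, (x << 8) != 0 and
  // x != 0 agree, because no set bit is shifted out of the 32-bit value.
  static bool withShl(uint32_t x) { return (x << 8) != 0; }
  static bool withoutShl(uint32_t x) { return x != 0; }

  int main() {
    for (uint32_t x = 0; x <= 0xFFFF; ++x)
      assert(withShl(x) == withoutShl(x));
  }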

Collaborator (author) commented:

Maybe the difference is that this patch checks the Fast result from the allowsMemoryAccess check, while DAGCombiner::isLegalNarrowLdSt only checks whether the access is legal (not whether the narrowed, not naturally aligned, access is also considered Fast).
So the new optimization in this patch is a bit more strict, and then we suffer from DAGCombiner::reduceLoadWidth introducing a SHL that isn't really needed.

Collaborator commented:

Have you looked into this any further?

Collaborator (author) commented:

> Have you looked into this any further?

See 5e9f960#r1552445909

After that I also made sure the test is run both with -mtriple arm and -mtriple armv7 to show the difference in ARMTargetLowering::allowsMisalignedMemoryAccesses depending on the actual triple.
My new code adheres to the "Fast" settings in ARMTargetLowering, while DAGCombiner::isLegalNarrowLdSt doesn't. But maybe we want to do these transforms regardless of whether the unaligned accesses are considered "Fast"? It felt wrong to me to make it that aggressive.

I'm also not sure why this code in DAGCombiner.cpp

  // Ensure that this isn't going to produce an unsupported memory access.
  if (ShAmt) {
    assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
    const unsigned ByteShAmt = ShAmt / 8;
    const Align LDSTAlign = LDST->getAlign();
    const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                                LDST->getAddressSpace(), NarrowAlign,
                                LDST->getMemOperand()->getFlags()))
      return false;
  }

is skipping the "Fast" result that allowsMemoryAccess would provide via the extra bool argument.

I kind of think the new behavior is more correct in some sense (taking the "Fast" part of the TLI hook into account). But then I guess we want to do that in DAGCombiner::isLegalNarrowLdSt as well (potentially getting even more diffs that look like "regressions", but that might in fact be optimizations in case unaligned accesses are costly).
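For comparison, a sketch (not part of this patch) of how the quoted DAGCombiner check could also consult the Fast result, mirroring the allowsMemoryAccess call this patch adds in SimplifySetCC:

  // Ensure that this isn't going to produce an unsupported memory access,
  // and (sketch) also reject narrowed accesses the target reports as slow.
  if (ShAmt) {
    assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
    const unsigned ByteShAmt = ShAmt / 8;
    const Align LDSTAlign = LDST->getAlign();
    const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
    unsigned IsFast = 0;
    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                                LDST->getAddressSpace(), NarrowAlign,
                                LDST->getMemOperand()->getFlags(), &IsFast) ||
        !IsFast)
      return false;
  }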

; CHECK-LE-NEXT: movne r0, #1
; CHECK-LE-NEXT: mov pc, lr
;
@@ -444,9 +446,7 @@ define i1 @test_48_17_0(ptr %y) {
;
; CHECK-V7-BE-LABEL: test_48_17_0:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldr r1, [r0]
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #4]
; CHECK-V7-BE-NEXT: orr r0, r0, r1, lsl #16
; CHECK-V7-BE-NEXT: ldr r0, [r0, #2]
; CHECK-V7-BE-NEXT: bfc r0, #17, #15
; CHECK-V7-BE-NEXT: cmp r0, #0
; CHECK-V7-BE-NEXT: movwne r0, #1
@@ -506,15 +506,14 @@ define i1 @test_40_1_32(ptr %y) {
;
; CHECK-BE-LABEL: test_40_1_32:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr r0, [r0]
; CHECK-BE-NEXT: mov r1, #1
; CHECK-BE-NEXT: and r0, r1, r0, lsr #24
; CHECK-BE-NEXT: ldrb r0, [r0]
; CHECK-BE-NEXT: and r0, r0, #1
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_40_1_32:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldr r0, [r0]
; CHECK-V7-BE-NEXT: ubfx r0, r0, #24, #1
; CHECK-V7-BE-NEXT: ldrb r0, [r0]
; CHECK-V7-BE-NEXT: and r0, r0, #1
; CHECK-V7-BE-NEXT: bx lr
%a = load i40, ptr %y
%b = and i40 %a, u0x100000000
38 changes: 15 additions & 23 deletions llvm/test/CodeGen/PowerPC/simplifysetcc_narrow_load.ll
@@ -26,7 +26,7 @@ define i1 @test_129_15_0(ptr %y) {
;
; CHECK-BE-LABEL: test_129_15_0:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lhz 3, 14(3)
; CHECK-BE-NEXT: lhz 3, 15(3)
; CHECK-BE-NEXT: clrlwi 3, 3, 17
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
@@ -69,7 +69,7 @@ define i1 @test_33_8_0(ptr %y) {
;
; CHECK-BE-LABEL: test_33_8_0:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 3(3)
; CHECK-BE-NEXT: lbz 3, 4(3)
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
@@ -105,7 +105,7 @@ define i1 @test_33_1_31(ptr %y) {
;
; CHECK-BE-LABEL: test_33_1_31:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 0(3)
; CHECK-BE-NEXT: lbz 3, 1(3)
; CHECK-BE-NEXT: srwi 3, 3, 7
; CHECK-BE-NEXT: blr
%a = load i33, ptr %y
@@ -123,7 +123,7 @@ define i1 @test_33_1_0(ptr %y) {
;
; CHECK-BE-LABEL: test_33_1_0:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 3(3)
; CHECK-BE-NEXT: lbz 3, 4(3)
; CHECK-BE-NEXT: clrlwi 3, 3, 31
; CHECK-BE-NEXT: blr
%a = load i33, ptr %y
@@ -250,12 +250,10 @@ define i1 @test_48_17_0(ptr %y) {
;
; CHECK-BE-LABEL: test_48_17_0:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lhz 4, 4(3)
; CHECK-BE-NEXT: lwz 3, 0(3)
; CHECK-BE-NEXT: clrlwi 4, 4, 16
; CHECK-BE-NEXT: rlwimi 4, 3, 16, 15, 15
; CHECK-BE-NEXT: addic 3, 4, -1
; CHECK-BE-NEXT: subfe 3, 3, 4
; CHECK-BE-NEXT: lwz 3, 2(3)
; CHECK-BE-NEXT: clrlwi 3, 3, 15
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
%a = load i48, ptr %y
%b = and i48 %a, u0x1ffff
@@ -292,8 +290,8 @@ define i1 @test_40_1_32(ptr %y) {
;
; CHECK-BE-LABEL: test_40_1_32:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lwz 3, 0(3)
; CHECK-BE-NEXT: rlwinm 3, 3, 8, 31, 31
; CHECK-BE-NEXT: lbz 3, 0(3)
; CHECK-BE-NEXT: clrlwi 3, 3, 31
; CHECK-BE-NEXT: blr
%a = load i40, ptr %y
%b = and i40 %a, u0x100000000
@@ -325,15 +323,13 @@ define i1 @test_24_8_8(ptr %y) {
; CHECK-LE-LABEL: test_24_8_8:
; CHECK-LE: # %bb.0:
; CHECK-LE-NEXT: lbz 3, 1(3)
; CHECK-LE-NEXT: slwi 3, 3, 8
; CHECK-LE-NEXT: addic 4, 3, -1
; CHECK-LE-NEXT: subfe 3, 4, 3
; CHECK-LE-NEXT: blr
;
; CHECK-BE-LABEL: test_24_8_8:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 1(3)
; CHECK-BE-NEXT: slwi 3, 3, 8
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
@@ -346,18 +342,16 @@ define i1 @test_24_8_12(ptr %y) {
define i1 @test_24_8_12(ptr %y) {
; CHECK-LE-LABEL: test_24_8_12:
; CHECK-LE: # %bb.0:
; CHECK-LE-NEXT: lhz 4, 0(3)
; CHECK-LE-NEXT: lbz 3, 2(3)
; CHECK-LE-NEXT: rlwinm 4, 4, 0, 16, 19
; CHECK-LE-NEXT: rlwimi 4, 3, 16, 12, 15
; CHECK-LE-NEXT: addic 3, 4, -1
; CHECK-LE-NEXT: subfe 3, 3, 4
; CHECK-LE-NEXT: lhz 3, 1(3)
; CHECK-LE-NEXT: rlwinm 3, 3, 0, 20, 27
; CHECK-LE-NEXT: addic 4, 3, -1
; CHECK-LE-NEXT: subfe 3, 4, 3
; CHECK-LE-NEXT: blr
;
; CHECK-BE-LABEL: test_24_8_12:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lhz 3, 0(3)
; CHECK-BE-NEXT: rlwinm 3, 3, 8, 12, 19
; CHECK-BE-NEXT: rlwinm 3, 3, 0, 20, 27
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
Expand All @@ -371,15 +365,13 @@ define i1 @test_24_8_16(ptr %y) {
; CHECK-LE-LABEL: test_24_8_16:
; CHECK-LE: # %bb.0:
; CHECK-LE-NEXT: lbz 3, 2(3)
; CHECK-LE-NEXT: slwi 3, 3, 16
; CHECK-LE-NEXT: addic 4, 3, -1
; CHECK-LE-NEXT: subfe 3, 4, 3
; CHECK-LE-NEXT: blr
;
; CHECK-BE-LABEL: test_24_8_16:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 0(3)
; CHECK-BE-NEXT: slwi 3, 3, 16
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr