From 3e8e3c65bf8c1ecbb956bfc7e26722c27fec59fb Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Mon, 6 Jan 2025 11:53:37 +0000
Subject: [PATCH 1/2] Add tests

---
 llvm/test/CodeGen/AArch64/reduce-or-opt.ll    | 193 ++++++++++++++++++
 .../CodeGenPrepare/AArch64/reduce-or-opt.ll   | 185 +++++++++++++++++
 2 files changed, 378 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/reduce-or-opt.ll
 create mode 100644 llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll

diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
new file mode 100644
index 0000000000000..026248dc8c0d5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_v2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    tbnz w9, #0, .LBB0_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    cmp x8, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  .LBB0_3: // %middle.split
+; CHECK-NEXT:    and x0, x9, #0x1
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_v2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    tbnz w9, #0, .LBB1_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    cmp x8, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  .LBB1_3: // %middle.split
+; CHECK-NEXT:    tbz w9, #0, .LBB1_5
+; CHECK-NEXT:  // %bb.4: // %found
+; CHECK-NEXT:    mov w8, #56 // =0x38
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:    neg x8, x9
+; CHECK-NEXT:    add x11, x8, #4
+; CHECK-NEXT:  .LBB2_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    b.ne .LBB2_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    cmp x11, x10
+; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    b.ne .LBB2_1
+; CHECK-NEXT:  .LBB2_3: // %middle.split
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
+; CHECK-NEXT:  .LBB3_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    cset w11, ne
+; CHECK-NEXT:    b.ne .LBB3_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
+; CHECK-NEXT:    b.ne .LBB3_1
+; CHECK-NEXT:  .LBB3_3: // %middle.split
+; CHECK-NEXT:    tbz w11, #0, .LBB3_5
+; CHECK-NEXT:  // %bb.4: // %found
+; CHECK-NEXT:    mov w8, #56 // =0x38
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_5:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
new file mode 100644
index 0000000000000..99fe6b008ebbb
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -S < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[OR_REDUC]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[OR_REDUC]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[OR_REDUC]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[OR_REDUC]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)

From 04f0aa7172b1f70834a4d4d228456b5f95e85b48 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Mon, 6 Jan 2025 11:53:46 +0000
Subject: [PATCH 2/2] [AArch64] Improve codegen of vectorised early exit loops

Once PR #112138 lands we are able to start vectorising more loops
that have uncountable early exits. The typical loop structure looks
like this:

vector.body:
  ...
  %pred = icmp eq <2 x ptr> %wide.load, %broadcast.splat
  ...
  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %pred)
  %iv.cmp = icmp eq i64 %index.next, 4
  %exit.cond = or i1 %or.reduc, %iv.cmp
  br i1 %exit.cond, label %middle.split, label %vector.body

middle.split:
  br i1 %or.reduc, label %found, label %notfound

found:
  ret i64 1

notfound:
  ret i64 0

The problem with this is that %or.reduc is kept live after the loop,
and since this is a boolean it typically requires making a copy of the
condition code register. For AArch64 this requires an additional cset
instruction, which is quite expensive for a typical find loop that
only contains 6 or 7 instructions. This patch attempts to improve the
codegen by sinking the reduction out of the loop to the location of
its user. Keeping the vector predicate alive instead is much cheaper,
since its type is legal and there are plenty of registers to hold it.
There is a potential downside in that a little more work is required
after the loop, but I believe this is worth it since we are likely to
spend most of our time in the loop.
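
To illustrate the intended effect, here is a sketch of the IR after
CodeGenPrepare has performed the sink, distilled from the test updates
below. The names %pred.sunk and %reduc.sunk are illustrative only; the
sinking code actually introduces unnamed values:

middle.split:
  ; The compare and reduction are recomputed here from the still-live
  ; %wide.load, so the in-loop exit branch can consume the flags set by
  ; the vector compare directly instead of carrying an i1 out of the loop.
  %pred.sunk = icmp eq <vscale x 2 x ptr> %wide.load, zeroinitializer
  %reduc.sunk = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %pred.sunk)
  br i1 %reduc.sunk, label %found, label %notfound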
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 25 ++++++++++++++++++-
 llvm/test/CodeGen/AArch64/reduce-or-opt.ll    | 22 ++++++++--------
 .../CodeGenPrepare/AArch64/reduce-or-opt.ll   |  8 ++++--
 3 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7c5e5336b6531..88dbc60cc4bd5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5290,11 +5290,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     }
   }
 
-  // Sink vscales closer to uses for better isel
+  auto ShouldSinkCondition = [](Value *Cond) -> bool {
+    auto *II = dyn_cast<IntrinsicInst>(Cond);
+    return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
+           isa<ScalableVectorType>(II->getOperand(0)->getType());
+  };
+
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
   case Instruction::Add:
   case Instruction::Sub:
+    // Sink vscales closer to uses for better isel
     for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
       if (shouldSinkVScale(I->getOperand(Op), Ops)) {
         Ops.push_back(&I->getOperandUse(Op));
@@ -5302,6 +5308,23 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
       }
     }
     break;
+  case Instruction::Select: {
+    if (!ShouldSinkCondition(I->getOperand(0)))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
+  case Instruction::Br: {
+    if (cast<BranchInst>(I)->isUnconditional())
+      return false;
+
+    if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
   default:
     break;
   }
diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
index 026248dc8c0d5..f5df5ea53c990 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -93,24 +93,24 @@
 define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
 ; CHECK-LABEL: select_or_reduce_nxv2i1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x10, xzr
-; CHECK-NEXT:    neg x8, x9
-; CHECK-NEXT:    add x11, x8, #4
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    b.ne .LBB2_3
 ; CHECK-NEXT:  // %bb.2: // %vector.body
 ; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
-; CHECK-NEXT:    cmp x11, x10
-; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
 ; CHECK-NEXT:    b.ne .LBB2_1
 ; CHECK-NEXT:  .LBB2_3: // %middle.split
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
 entry:
   %vscale = tail call i64 @llvm.vscale.i64()
@@ -145,7 +145,6 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; CHECK-NEXT:    cset w11, ne
 ; CHECK-NEXT:    b.ne .LBB3_3
 ; CHECK-NEXT:  // %bb.2: // %vector.body
 ; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
@@ -153,7 +152,8 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
 ; CHECK-NEXT:    add x9, x9, x8
 ; CHECK-NEXT:    b.ne .LBB3_1
 ; CHECK-NEXT:  .LBB3_3: // %middle.split
-; CHECK-NEXT:    tbz w11, #0, .LBB3_5
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    b.eq .LBB3_5
 ; CHECK-NEXT:  // %bb.4: // %found
 ; CHECK-NEXT:    mov w8, #56 // =0x38
 ; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    str x8, [x1]
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
index 99fe6b008ebbb..52257c10b0bf6 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
@@ -105,7 +105,9 @@ define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
 ; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
 ; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
 ; CHECK:       [[MIDDLE_SPLIT]]:
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[OR_REDUC]], i64 1, i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[TMP3]], i64 1, i64 0
 ; CHECK-NEXT:    ret i64 [[SEL]]
 ;
 entry:
@@ -147,7 +149,9 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
 ; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
 ; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
 ; CHECK:       [[MIDDLE_SPLIT]]:
-; CHECK-NEXT:    br i1 [[OR_REDUC]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
 ; CHECK:       [[FOUND]]:
 ; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
 ; CHECK-NEXT:    ret i64 1