From 879452579eaefdf311997b13894f7382a4bab649 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Tue, 31 Oct 2023 10:07:55 +0000 Subject: [PATCH 1/7] Pre-commit tests --- .../AArch64/machine-licm-hoist-load.ll | 428 ++++++++++++++++++ 1 file changed, 428 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll new file mode 100644 index 0000000000000..ec06bddc60c85 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll @@ -0,0 +1,428 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +define i64 @one_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { +; CHECK-LABEL: one_dimensional: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: cbz x2, .LBB0_2 +; CHECK-NEXT: .LBB0_1: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr x9, [x0], #8 +; CHECK-NEXT: ldr w10, [x1] +; CHECK-NEXT: ldr w9, [x9] +; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: cinc x8, x8, ne +; CHECK-NEXT: subs x2, x2, #1 +; CHECK-NEXT: b.ne .LBB0_1 +; CHECK-NEXT: .LBB0_2: // %for.cond.cleanup +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret +entry: + %cmp4 = icmp eq i64 %N, 0 + br i1 %cmp4, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.body ] + ret i64 %sum.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06 + %0 = load ptr, ptr %arrayidx, align 8 + %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4) + %tobool = icmp ne i32 %bcmp, 0 + %add = zext i1 %tobool to i64 + %spec.select = add i64 %sum.05, %add + 
%inc = add nuw i64 %i.06, 1 + %exitcond = icmp eq i64 %inc, %N + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { +; CHECK-LABEL: two_dimensional: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: cbz x2, .LBB1_6 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: cbz x3, .LBB1_6 +; CHECK-NEXT: // %bb.2: // %for.cond1.preheader.preheader +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB1_3: // %for.cond1.preheader +; CHECK-NEXT: // =>This Loop Header: Depth=1 +; CHECK-NEXT: // Child Loop BB1_4 Depth 2 +; CHECK-NEXT: ldr x10, [x0, x9, lsl #3] +; CHECK-NEXT: mov x11, x3 +; CHECK-NEXT: .LBB1_4: // %for.body4 +; CHECK-NEXT: // Parent Loop BB1_3 Depth=1 +; CHECK-NEXT: // => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldr x12, [x10], #8 +; CHECK-NEXT: ldr w13, [x1] +; CHECK-NEXT: ldr w12, [x12] +; CHECK-NEXT: cmp w12, w13 +; CHECK-NEXT: cinc x8, x8, ne +; CHECK-NEXT: subs x11, x11, #1 +; CHECK-NEXT: b.ne .LBB1_4 +; CHECK-NEXT: // %bb.5: // %for.cond1.for.cond.cleanup3_crit_edge +; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: cmp x9, x2 +; CHECK-NEXT: b.ne .LBB1_3 +; CHECK-NEXT: .LBB1_6: // %for.cond.cleanup +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret +entry: + %cmp17 = icmp eq i64 %N, 0 + %cmp214 = icmp eq i64 %M, 0 + %or.cond = or i1 %cmp17, %cmp214 + br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge + %i.019 = phi i64 [ %inc7, %for.cond1.for.cond.cleanup3_crit_edge ], [ 0, %entry ] + %sum.018 = phi i64 [ %spec.select, %for.cond1.for.cond.cleanup3_crit_edge ], [ 0, %entry ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.019 + %0 = load ptr, ptr %arrayidx, align 8 + br label %for.body4 + +for.body4: ; preds = %for.cond1.preheader, %for.body4 + %j.016 = phi i64 [ 0, %for.cond1.preheader ], 
[ %inc, %for.body4 ] + %sum.115 = phi i64 [ %sum.018, %for.cond1.preheader ], [ %spec.select, %for.body4 ] + %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016 + %1 = load ptr, ptr %arrayidx5, align 8 + %bcmp = tail call i32 @bcmp(ptr %1, ptr %b, i64 4) + %tobool = icmp ne i32 %bcmp, 0 + %add = zext i1 %tobool to i64 + %spec.select = add i64 %sum.115, %add + %inc = add nuw i64 %j.016, 1 + %exitcond = icmp eq i64 %inc, %M + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge, label %for.body4 + +for.cond1.for.cond.cleanup3_crit_edge: ; preds = %for.body4 + %inc7 = add nuw i64 %i.019, 1 + %exitcond22 = icmp eq i64 %inc7, %N + br i1 %exitcond22, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge, %entry + %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond.cleanup3_crit_edge ] + ret i64 %sum.0.lcssa +} + +define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { +; CHECK-LABEL: three_dimensional_middle: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: cbz x2, .LBB2_9 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: cbz x3, .LBB2_9 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: cbz x4, .LBB2_9 +; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: .LBB2_4: // %for.cond1.preheader +; CHECK-NEXT: // =>This Loop Header: Depth=1 +; CHECK-NEXT: // Child Loop BB2_5 Depth 2 +; CHECK-NEXT: // Child Loop BB2_6 Depth 3 +; CHECK-NEXT: ldr x10, [x8, x9, lsl #3] +; CHECK-NEXT: mov x11, xzr +; CHECK-NEXT: .LBB2_5: // %for.cond5.preheader +; CHECK-NEXT: // Parent Loop BB2_4 Depth=1 +; CHECK-NEXT: // => This Loop Header: Depth=2 +; CHECK-NEXT: // Child Loop BB2_6 Depth 3 +; CHECK-NEXT: lsl x13, x11, #3 +; CHECK-NEXT: mov x14, x4 +; CHECK-NEXT: ldr x12, [x10, x13] +; CHECK-NEXT: ldr x13, [x1, x13] +; CHECK-NEXT: .LBB2_6: // 
%for.body8 +; CHECK-NEXT: // Parent Loop BB2_4 Depth=1 +; CHECK-NEXT: // Parent Loop BB2_5 Depth=2 +; CHECK-NEXT: // => This Inner Loop Header: Depth=3 +; CHECK-NEXT: ldr x15, [x12], #8 +; CHECK-NEXT: ldr w16, [x13] +; CHECK-NEXT: ldr w15, [x15] +; CHECK-NEXT: cmp w15, w16 +; CHECK-NEXT: cinc x0, x0, ne +; CHECK-NEXT: subs x14, x14, #1 +; CHECK-NEXT: b.ne .LBB2_6 +; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond +; CHECK-NEXT: // in Loop: Header=BB2_5 Depth=2 +; CHECK-NEXT: add x11, x11, #1 +; CHECK-NEXT: cmp x11, x3 +; CHECK-NEXT: b.ne .LBB2_5 +; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond +; CHECK-NEXT: // in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: cmp x9, x2 +; CHECK-NEXT: b.ne .LBB2_4 +; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %cmp33 = icmp eq i64 %N, 0 + %cmp229 = icmp eq i64 %M, 0 + %or.cond = or i1 %cmp33, %cmp229 + %cmp626 = icmp eq i64 %K, 0 + %or.cond48 = or i1 %or.cond, %cmp626 + br i1 %or.cond48, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond + %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ] + %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035 + %0 = load ptr, ptr %arrayidx, align 8 + br label %for.cond5.preheader + +for.cond5.preheader: ; preds = %for.cond5.for.cond, %for.cond1.preheader + %j.031 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.cond5.for.cond ] + %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ] + %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031 + %1 = load ptr, ptr %arrayidx9, align 8 + %arrayidx11 = getelementptr inbounds ptr, ptr %b, i64 %j.031 + %2 = load ptr, ptr %arrayidx11, align 8 + br label %for.body8 + +for.body8: ; preds = %for.body8, %for.cond5.preheader + %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ] + %sum.227 = phi i64 
[ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ] + %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028 + %3 = load ptr, ptr %arrayidx10, align 8 + %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4) + %tobool = icmp ne i32 %bcmp, 0 + %add = zext i1 %tobool to i64 + %spec.select = add i64 %sum.227, %add + %inc = add nuw i64 %k.028, 1 + %exitcond = icmp eq i64 %inc, %K + br i1 %exitcond, label %for.cond5.for.cond, label %for.body8 + +for.cond5.for.cond: ; preds = %for.body8 + %inc13 = add nuw i64 %j.031, 1 + %exitcond46 = icmp eq i64 %inc13, %M + br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader + +for.cond1.for.cond: ; preds = %for.cond5.for.cond + %inc16 = add nuw i64 %i.035, 1 + %exitcond47 = icmp eq i64 %inc16, %N + br i1 %exitcond47, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond.cleanup: ; preds = %for.cond1.for.cond, %entry + %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond ] + ret i64 %sum.0.lcssa +} + +define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { +; CHECK-LABEL: three_dimensional: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: cbz x2, .LBB3_9 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: cbz x3, .LBB3_9 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: cbz x4, .LBB3_9 +; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: .LBB3_4: // %for.cond1.preheader +; CHECK-NEXT: // =>This Loop Header: Depth=1 +; CHECK-NEXT: // Child Loop BB3_5 Depth 2 +; CHECK-NEXT: // Child Loop BB3_6 Depth 3 +; CHECK-NEXT: ldr x10, [x8, x9, lsl #3] +; CHECK-NEXT: mov x11, xzr +; CHECK-NEXT: .LBB3_5: // %for.cond5.preheader +; CHECK-NEXT: // Parent Loop BB3_4 Depth=1 +; CHECK-NEXT: // => This Loop Header: Depth=2 +; CHECK-NEXT: // Child Loop BB3_6 Depth 3 +; CHECK-NEXT: ldr x12, [x10, x11, lsl #3] +; CHECK-NEXT: mov x13, x4 +; CHECK-NEXT: 
.LBB3_6: // %for.body8 +; CHECK-NEXT: // Parent Loop BB3_4 Depth=1 +; CHECK-NEXT: // Parent Loop BB3_5 Depth=2 +; CHECK-NEXT: // => This Inner Loop Header: Depth=3 +; CHECK-NEXT: ldr x14, [x12], #8 +; CHECK-NEXT: ldr w15, [x1] +; CHECK-NEXT: ldr w14, [x14] +; CHECK-NEXT: cmp w14, w15 +; CHECK-NEXT: cinc x0, x0, ne +; CHECK-NEXT: subs x13, x13, #1 +; CHECK-NEXT: b.ne .LBB3_6 +; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond +; CHECK-NEXT: // in Loop: Header=BB3_5 Depth=2 +; CHECK-NEXT: add x11, x11, #1 +; CHECK-NEXT: cmp x11, x3 +; CHECK-NEXT: b.ne .LBB3_5 +; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond +; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1 +; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: cmp x9, x2 +; CHECK-NEXT: b.ne .LBB3_4 +; CHECK-NEXT: .LBB3_9: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %cmp31 = icmp eq i64 %N, 0 + %cmp227 = icmp eq i64 %M, 0 + %or.cond = or i1 %cmp31, %cmp227 + %cmp624 = icmp eq i64 %K, 0 + %or.cond46 = or i1 %or.cond, %cmp624 + br i1 %or.cond46, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond + %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ] + %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033 + %0 = load ptr, ptr %arrayidx, align 8 + br label %for.cond5.preheader + +for.cond5.preheader: ; preds = %for.cond5.for.cond, %for.cond1.preheader + %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ] + %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ] + %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029 + %1 = load ptr, ptr %arrayidx9, align 8 + br label %for.body8 + +for.body8: ; preds = %for.body8, %for.cond5.preheader + %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ] + %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ] + %arrayidx10 = getelementptr 
inbounds ptr, ptr %1, i64 %k.026 + %2 = load ptr, ptr %arrayidx10, align 8 + %bcmp = tail call i32 @bcmp(ptr %2, ptr %b, i64 4) + %tobool = icmp ne i32 %bcmp, 0 + %add = zext i1 %tobool to i64 + %spec.select = add i64 %sum.225, %add + %inc = add nuw i64 %k.026, 1 + %exitcond = icmp eq i64 %inc, %K + br i1 %exitcond, label %for.cond5.for.cond, label %for.body8 + +for.cond5.for.cond: ; preds = %for.body8 + %inc12 = add nuw i64 %j.029, 1 + %exitcond44 = icmp eq i64 %inc12, %M + br i1 %exitcond44, label %for.cond1.for.cond, label %for.cond5.preheader + +for.cond1.for.cond: ; preds = %for.cond5.for.cond + %inc15 = add nuw i64 %i.033, 1 + %exitcond45 = icmp eq i64 %inc15, %N + br i1 %exitcond45, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond.cleanup: ; preds = %for.cond1.for.cond, %entry + %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond ] + ret i64 %sum.0.lcssa +} + +define i32 @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N, i32 %M, i32 %K) { +; CHECK-LABEL: one_dimensional_with_store: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB4_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, w3 +; CHECK-NEXT: .LBB4_2: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr x9, [x0], #8 +; CHECK-NEXT: ldr w10, [x1] +; CHECK-NEXT: ldr w9, [x9] +; CHECK-NEXT: rev w10, w10 +; CHECK-NEXT: rev w9, w9 +; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: cset w9, hi +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: subs x8, x8, #1 +; CHECK-NEXT: sub w9, w9, w10 +; CHECK-NEXT: strb w9, [x2], #1 +; CHECK-NEXT: b.ne .LBB4_2 +; CHECK-NEXT: .LBB4_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret i32 
0 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv + %0 = load ptr, ptr %arrayidx, align 8 + %call = tail call i32 @memcmp(ptr %0, ptr %b, i64 4) + %conv = trunc i32 %call to i8 + %arrayidx2 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv + store i8 %conv, ptr %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define i32 @one_dimensional_with_call(ptr %a, ptr %b, i32 %N, i32 %M, i32 %K) { +; CHECK-LABEL: one_dimensional_with_call: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w30, -48 +; CHECK-NEXT: cmp w2, #1 +; CHECK-NEXT: b.lt .LBB5_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x21, x0 +; CHECK-NEXT: mov w20, wzr +; CHECK-NEXT: mov w22, w2 +; CHECK-NEXT: .LBB5_2: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr x8, [x21], #8 +; CHECK-NEXT: ldr w9, [x19] +; CHECK-NEXT: ldr w8, [x8] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: cinc w20, w20, ne +; CHECK-NEXT: bl func +; CHECK-NEXT: subs x22, x22, #1 +; CHECK-NEXT: b.ne .LBB5_2 +; CHECK-NEXT: b .LBB5_4 +; CHECK-NEXT: .LBB5_3: +; CHECK-NEXT: mov w20, wzr +; CHECK-NEXT: .LBB5_4: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; 
CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %for.body ] + ret i32 %sum.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %sum.05 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv + %0 = load ptr, ptr %arrayidx, align 8 + %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4) + %tobool.not = icmp ne i32 %bcmp, 0 + %add = zext i1 %tobool.not to i32 + %spec.select = add nuw nsw i32 %sum.05, %add + tail call void @func() + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + + +declare i32 @bcmp(ptr, ptr, i64) +declare i32 @memcmp(ptr, ptr, i64) +declare void @func() From c5794882ec7043b4257431d2cdcda857e94d97b4 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Tue, 31 Oct 2023 10:59:36 +0000 Subject: [PATCH 2/7] [MachineLICM] Allow hoisting loads from invariant address Sometimes, loads can appear in a loop after the LICM pass has run for the final time. For example, the ExpandMemCmp pass creates loads in a loop, and one of the operands may be an invariant address. This patch extends the pre-regalloc stage of MachineLICM to hoist loads from invariant addresses out of loops that contain no stores or calls and whose loads may be freely reordered.
--- llvm/lib/CodeGen/MachineLICM.cpp | 79 +++++++++++++++---- .../AArch64/machine-licm-hoist-load.ll | 75 ++++++++++-------- .../AArch64/ragreedy-local-interval-cost.ll | 2 +- llvm/test/CodeGen/AArch64/sinksplat.ll | 2 +- llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 4 +- .../RISCV/rvv/fold-scalar-load-crash.ll | 48 +++++------ .../CodeGen/X86/2009-04-25-CoalescerBug.ll | 7 +- llvm/test/CodeGen/X86/block-placement.ll | 3 +- llvm/test/CodeGen/X86/fma-commute-loop.ll | 2 +- llvm/test/CodeGen/X86/pr49393.ll | 15 ++-- llvm/test/CodeGen/X86/pr53842.ll | 14 ++-- 11 files changed, 154 insertions(+), 97 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index e29f28ecaea0d..f1af74328f002 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -72,6 +72,11 @@ static cl::opt<bool> HoistConstStores("hoist-const-stores", cl::desc("Hoist invariant stores"), cl::init(true), cl::Hidden); + +static cl::opt<bool> HoistConstLoads("hoist-const-loads", + cl::desc("Hoist invariant loads"), + cl::init(true), cl::Hidden); + // The default threshold of 100 (i.e. if target block is 100 times hotter) // is based on empirical data on a single target and is subject to tuning.
static cl::opt<unsigned> @@ -222,9 +227,11 @@ namespace { void AddToLiveIns(MCRegister Reg, MachineLoop *CurLoop); - bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop); + bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop, + bool SafeToMoveLoad); - bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop); + bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop, + bool SafeToMoveLoad); bool HasLoopPHIUse(const MachineInstr *MI, MachineLoop *CurLoop); @@ -277,7 +284,7 @@ namespace { bool MayCSE(MachineInstr *MI); unsigned Hoist(MachineInstr *MI, MachineBasicBlock *Preheader, - MachineLoop *CurLoop); + MachineLoop *CurLoop, bool SafeToMoveLoad); void InitCSEMap(MachineBasicBlock *BB); @@ -494,7 +501,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, // operands. FIXME: Consider unfold load folding instructions. if (Def && !RuledOut) { int FI = std::numeric_limits<int>::min(); - if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop)) || + if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop, false)) || (TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI))) Candidates.push_back(CandidateInfo(MI, Def, FI)); } @@ -772,6 +779,32 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, BackTrace.clear(); InitRegPressure(Preheader); + // Compute information about whether it is allowed to move load instruction + // out of the current loop or one of the inner loops + SmallDenseMap<MachineLoop *, bool> AllowedToHoistLoads; + if (HoistConstLoads) { + SmallVector<MachineLoop *> Worklist{CurLoop}; + + while (!Worklist.empty()) { + auto *L = Worklist.pop_back_val(); + AllowedToHoistLoads[L] = true; + Worklist.insert(Worklist.end(), L->getSubLoops().begin(), + L->getSubLoops().end()); + } + + for (auto *MBB : CurLoop->blocks()) { + for (auto &MI : *MBB) { + if (MI.mayStore() || MI.isCall() || (MI.mayLoad() && MI.hasOrderedMemoryRef())) { + for (MachineLoop *L = MLI->getLoopFor(MI.getParent()); L != CurLoop; + L = L->getParentLoop()) +
AllowedToHoistLoads[L] = false; + AllowedToHoistLoads[CurLoop] = false; + break; + } + } + } + } + // Now perform LICM. for (MachineDomTreeNode *Node : Scopes) { MachineBasicBlock *MBB = Node->getBlock(); @@ -780,9 +813,23 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, // Process the block SpeculationState = SpeculateUnknown; + + auto CanMoveLoad = [](MachineLoop *L) -> bool { + dbgs() << L << "\n"; + for (auto *MBB : L->blocks()) { + for (auto &MI : *MBB) { + // Taken from MachineInstr::isSafeToMove + if (MI.mayStore() || MI.isCall() || (MI.mayLoad() && MI.hasOrderedMemoryRef())) + return false; + } + } + return true; + }; + + bool SafeToMoveLoad = HoistConstLoads && AllowedToHoistLoads[CurLoop]; for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { unsigned HoistRes = HoistResult::NotHoisted; - HoistRes = Hoist(&MI, Preheader, CurLoop); + HoistRes = Hoist(&MI, Preheader, CurLoop, SafeToMoveLoad); if (HoistRes & HoistResult::NotHoisted) { // We have failed to hoist MI to outermost loop's preheader. If MI is in // a subloop, try to hoist it to subloop's preheader. @@ -793,9 +840,12 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, while (!InnerLoopWorkList.empty()) { MachineLoop *InnerLoop = InnerLoopWorkList.pop_back_val(); + bool SafeToMoveLoadInner = + HoistConstLoads && AllowedToHoistLoads[InnerLoop]; MachineBasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); if (InnerLoopPreheader) { - HoistRes = Hoist(&MI, InnerLoopPreheader, InnerLoop); + HoistRes = + Hoist(&MI, InnerLoopPreheader, InnerLoop, SafeToMoveLoadInner); if (HoistRes & HoistResult::Hoisted) break; } @@ -990,9 +1040,10 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI, /// Returns true if the instruction may be a suitable candidate for LICM. /// e.g. If the instruction is a call, then it's obviously not safe to hoist it. 
-bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) { +bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop, + bool SafeToMoveLoad) { // Check if it's safe to move the instruction. - bool DontMoveAcrossStore = true; + bool DontMoveAcrossStore = !SafeToMoveLoad; if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) && !(HoistConstStores && isInvariantStore(I, TRI, MRI))) { LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n"); @@ -1025,9 +1076,9 @@ bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) { } /// Returns true if the instruction is loop invariant. -bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I, - MachineLoop *CurLoop) { - if (!IsLICMCandidate(I, CurLoop)) { +bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop, + bool SafeToMoveLoad) { + if (!IsLICMCandidate(I, CurLoop, SafeToMoveLoad)) { LLVM_DEBUG(dbgs() << "LICM: Instruction not a LICM candidate\n"); return false; } @@ -1305,7 +1356,7 @@ MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI, MBB->insert(Pos, NewMIs[1]); // If unfolding produced a load that wasn't loop-invariant or profitable to // hoist, discard the new instructions and bail. - if (!IsLoopInvariantInst(*NewMIs[0], CurLoop) || + if (!IsLoopInvariantInst(*NewMIs[0], CurLoop, /*SafeToMoveLoad=*/false) || !IsProfitableToHoist(*NewMIs[0], CurLoop)) { NewMIs[0]->eraseFromParent(); NewMIs[1]->eraseFromParent(); @@ -1432,7 +1483,7 @@ bool MachineLICMBase::MayCSE(MachineInstr *MI) { /// that are safe to hoist, this instruction is called to do the dirty work. /// It returns true if the instruction is hoisted.
unsigned MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader, - MachineLoop *CurLoop) { + MachineLoop *CurLoop, bool SafeToMoveLoad) { MachineBasicBlock *SrcBlock = MI->getParent(); // Disable the instruction hoisting due to block hotness @@ -1444,7 +1495,7 @@ unsigned MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader, } // First check whether we should hoist this instruction. bool HasExtractHoistableLoad = false; - if (!IsLoopInvariantInst(*MI, CurLoop) || + if (!IsLoopInvariantInst(*MI, CurLoop, SafeToMoveLoad) || !IsProfitableToHoist(*MI, CurLoop)) { // If not, try unfolding a hoistable load. MI = ExtractHoistableLoad(MI, CurLoop); diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll index ec06bddc60c85..6b76b03fe00fc 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll @@ -4,18 +4,23 @@ define i64 @one_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { ; CHECK-LABEL: one_dimensional: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz x2, .LBB0_4 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: cbz x2, .LBB0_2 -; CHECK-NEXT: .LBB0_1: // %for.body +; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr x9, [x0], #8 -; CHECK-NEXT: ldr w10, [x1] -; CHECK-NEXT: ldr w9, [x9] -; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: ldr x10, [x0], #8 +; CHECK-NEXT: ldr w10, [x10] +; CHECK-NEXT: cmp w10, w9 ; CHECK-NEXT: cinc x8, x8, ne ; CHECK-NEXT: subs x2, x2, #1 -; CHECK-NEXT: b.ne .LBB0_1 -; CHECK-NEXT: .LBB0_2: // %for.cond.cleanup +; CHECK-NEXT: b.ne .LBB0_2 +; CHECK-NEXT: // %bb.3: // %for.cond.cleanup +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret entry: @@ -48,22 +53,22 @@ define i64 
@two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: cbz x3, .LBB1_6 ; CHECK-NEXT: // %bb.2: // %for.cond1.preheader.preheader +; CHECK-NEXT: ldr w10, [x1] ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB1_3: // %for.cond1.preheader ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB1_4 Depth 2 -; CHECK-NEXT: ldr x10, [x0, x9, lsl #3] -; CHECK-NEXT: mov x11, x3 +; CHECK-NEXT: ldr x11, [x0, x9, lsl #3] +; CHECK-NEXT: mov x12, x3 ; CHECK-NEXT: .LBB1_4: // %for.body4 ; CHECK-NEXT: // Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr x12, [x10], #8 -; CHECK-NEXT: ldr w13, [x1] -; CHECK-NEXT: ldr w12, [x12] -; CHECK-NEXT: cmp w12, w13 +; CHECK-NEXT: ldr x13, [x11], #8 +; CHECK-NEXT: ldr w13, [x13] +; CHECK-NEXT: cmp w13, w10 ; CHECK-NEXT: cinc x8, x8, ne -; CHECK-NEXT: subs x11, x11, #1 +; CHECK-NEXT: subs x12, x12, #1 ; CHECK-NEXT: b.ne .LBB1_4 ; CHECK-NEXT: // %bb.5: // %for.cond1.for.cond.cleanup3_crit_edge ; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1 @@ -132,18 +137,18 @@ define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { ; CHECK-NEXT: // Parent Loop BB2_4 Depth=1 ; CHECK-NEXT: // => This Loop Header: Depth=2 ; CHECK-NEXT: // Child Loop BB2_6 Depth 3 -; CHECK-NEXT: lsl x13, x11, #3 +; CHECK-NEXT: lsl x12, x11, #3 ; CHECK-NEXT: mov x14, x4 -; CHECK-NEXT: ldr x12, [x10, x13] -; CHECK-NEXT: ldr x13, [x1, x13] +; CHECK-NEXT: ldr x13, [x1, x12] +; CHECK-NEXT: ldr x12, [x10, x12] +; CHECK-NEXT: ldr w13, [x13] ; CHECK-NEXT: .LBB2_6: // %for.body8 ; CHECK-NEXT: // Parent Loop BB2_4 Depth=1 ; CHECK-NEXT: // Parent Loop BB2_5 Depth=2 ; CHECK-NEXT: // => This Inner Loop Header: Depth=3 ; CHECK-NEXT: ldr x15, [x12], #8 -; CHECK-NEXT: ldr w16, [x13] ; CHECK-NEXT: ldr w15, [x15] -; CHECK-NEXT: cmp w15, w16 +; CHECK-NEXT: cmp w15, w13 ; CHECK-NEXT: cinc x0, x0, ne ; CHECK-NEXT: subs x14, x14, #1 ; 
CHECK-NEXT: b.ne .LBB2_6 @@ -214,43 +219,42 @@ for.cond.cleanup: ; preds = %for.cond1.for.cond, define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { ; CHECK-LABEL: three_dimensional: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: cbz x2, .LBB3_9 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: cbz x3, .LBB3_9 ; CHECK-NEXT: // %bb.2: // %entry ; CHECK-NEXT: cbz x4, .LBB3_9 ; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader +; CHECK-NEXT: ldr w10, [x1] ; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB3_4: // %for.cond1.preheader ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB3_5 Depth 2 ; CHECK-NEXT: // Child Loop BB3_6 Depth 3 -; CHECK-NEXT: ldr x10, [x8, x9, lsl #3] -; CHECK-NEXT: mov x11, xzr +; CHECK-NEXT: ldr x11, [x0, x9, lsl #3] +; CHECK-NEXT: mov x12, xzr ; CHECK-NEXT: .LBB3_5: // %for.cond5.preheader ; CHECK-NEXT: // Parent Loop BB3_4 Depth=1 ; CHECK-NEXT: // => This Loop Header: Depth=2 ; CHECK-NEXT: // Child Loop BB3_6 Depth 3 -; CHECK-NEXT: ldr x12, [x10, x11, lsl #3] -; CHECK-NEXT: mov x13, x4 +; CHECK-NEXT: ldr x13, [x11, x12, lsl #3] +; CHECK-NEXT: mov x14, x4 ; CHECK-NEXT: .LBB3_6: // %for.body8 ; CHECK-NEXT: // Parent Loop BB3_4 Depth=1 ; CHECK-NEXT: // Parent Loop BB3_5 Depth=2 ; CHECK-NEXT: // => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldr x14, [x12], #8 -; CHECK-NEXT: ldr w15, [x1] -; CHECK-NEXT: ldr w14, [x14] -; CHECK-NEXT: cmp w14, w15 -; CHECK-NEXT: cinc x0, x0, ne -; CHECK-NEXT: subs x13, x13, #1 +; CHECK-NEXT: ldr x15, [x13], #8 +; CHECK-NEXT: ldr w15, [x15] +; CHECK-NEXT: cmp w15, w10 +; CHECK-NEXT: cinc x8, x8, ne +; CHECK-NEXT: subs x14, x14, #1 ; CHECK-NEXT: b.ne .LBB3_6 ; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond ; CHECK-NEXT: // in Loop: Header=BB3_5 Depth=2 -; CHECK-NEXT: add x11, x11, #1 -; CHECK-NEXT: cmp x11, x3 +; CHECK-NEXT: add x12, x12, #1 +; 
CHECK-NEXT: cmp x12, x3 ; CHECK-NEXT: b.ne .LBB3_5 ; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond ; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1 @@ -258,6 +262,7 @@ define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { ; CHECK-NEXT: cmp x9, x2 ; CHECK-NEXT: b.ne .LBB3_4 ; CHECK-NEXT: .LBB3_9: // %for.cond.cleanup +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret entry: %cmp31 = icmp eq i64 %N, 0 diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index 419f25c22eb72..178336870373e 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -hoist-const-loads=false < %s | FileCheck %s @A = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8 @B = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8 diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll index ca51c7c85d2c9..cb63a4d78dc25 100644 --- a/llvm/test/CodeGen/AArch64/sinksplat.ll +++ b/llvm/test/CodeGen/AArch64/sinksplat.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -hoist-const-loads=false -o - %s | FileCheck %s define <4 x i32> @smull(<4 x i16> %x, ptr %y) { ; CHECK-LABEL: smull: diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index f24abb5684000..e1f1f5495e9af 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - 
%s | FileCheck %s -; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s +; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -hoist-const-loads=false -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -hoist-const-loads=false -o - %s | FileCheck --check-prefix=CHECK-BE %s ; CHECK-LABEL: lCPI0_0: ; CHECK-NEXT: .byte 0 ; 0x0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll index 96fdbfc6d0974..79b1e14b774a4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll @@ -7,49 +7,49 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) { ; RV32-LABEL: test: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi a3, a2, 1 +; RV32-NEXT: th.lbib a3, (a1), -1, 0 +; RV32-NEXT: th.lrb a0, a1, a0, 0 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: addi a1, a2, 1 ; RV32-NEXT: .LBB0_1: # %for.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: mv a4, a1 -; RV32-NEXT: th.lbib a5, (a4), -1, 0 -; RV32-NEXT: th.lrb a4, a4, a0, 0 -; RV32-NEXT: vmv.v.x v8, a5 ; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetvli zero, a3, e8, mf2, tu, ma -; RV32-NEXT: vslideup.vx v8, v9, a2 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vslideup.vx v10, v9, a2 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, tu, ma -; RV32-NEXT: vmv.s.x v8, a4 +; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32-NEXT: vmseq.vi v8, v8, 0 -; RV32-NEXT: vmv.x.s a4, v8 -; RV32-NEXT: andi a4, a4, 255 -; RV32-NEXT: bnez a4, .LBB0_1 +; RV32-NEXT: vmseq.vi v9, v10, 0 +; RV32-NEXT: vmv.x.s a3, v9 +; RV32-NEXT: andi a3, a3, 255 +; RV32-NEXT: bnez a3, .LBB0_1 ; RV32-NEXT: # %bb.2: # %if.then381 ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test: ; RV64: # %bb.0: # %entry +; RV64-NEXT: 
th.lbib a3, (a1), -1, 0 ; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: addi a3, a2, 1 +; RV64-NEXT: th.lrb a0, a1, a0, 0 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v8, a3 +; RV64-NEXT: addi a1, a2, 1 ; RV64-NEXT: .LBB0_1: # %for.body ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NEXT: mv a4, a1 -; RV64-NEXT: th.lbib a5, (a4), -1, 0 -; RV64-NEXT: th.lrb a4, a4, a0, 0 -; RV64-NEXT: vmv.v.x v8, a5 ; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli zero, a3, e8, mf2, tu, ma -; RV64-NEXT: vslideup.vx v8, v9, a2 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vslideup.vx v10, v9, a2 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, tu, ma -; RV64-NEXT: vmv.s.x v8, a4 +; RV64-NEXT: vmv.s.x v10, a0 ; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV64-NEXT: vmseq.vi v8, v8, 0 -; RV64-NEXT: vmv.x.s a4, v8 -; RV64-NEXT: andi a4, a4, 255 -; RV64-NEXT: bnez a4, .LBB0_1 +; RV64-NEXT: vmseq.vi v9, v10, 0 +; RV64-NEXT: vmv.x.s a3, v9 +; RV64-NEXT: andi a3, a3, 255 +; RV64-NEXT: bnez a3, .LBB0_1 ; RV64-NEXT: # %bb.2: # %if.then381 ; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll b/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll index 8494d87e1e0f2..ce28893090c43 100644 --- a/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll +++ b/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll @@ -5,14 +5,15 @@ define i64 @test(ptr %tmp13) nounwind { ; CHECK-LABEL: test: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl (%rdi), %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: shrl %eax ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %while.cond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %while.end -; CHECK-NEXT: shrl %eax ; CHECK-NEXT: retq entry: br label %while.cond diff --git a/llvm/test/CodeGen/X86/block-placement.ll 
b/llvm/test/CodeGen/X86/block-placement.ll index a522f0e9828a0..2f9635db34a33 100644 --- a/llvm/test/CodeGen/X86/block-placement.ll +++ b/llvm/test/CodeGen/X86/block-placement.ll @@ -318,8 +318,7 @@ define void @unnatural_cfg1() { ; CHECK-LABEL: unnatural_cfg1 ; CHECK: %entry ; CHECK: %loop.header -; CHECK: %loop.body2 -; CHECK: %loop.body3 +; CHECK: %loop.body5 entry: br label %loop.header diff --git a/llvm/test/CodeGen/X86/fma-commute-loop.ll b/llvm/test/CodeGen/X86/fma-commute-loop.ll index 833137fa6cd6d..a22e5d2e5e0c3 100644 --- a/llvm/test/CodeGen/X86/fma-commute-loop.ll +++ b/llvm/test/CodeGen/X86/fma-commute-loop.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f -hoist-const-loads=false | FileCheck %s define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, ptr %arg13, ptr %arg14) nounwind { ; CHECK-LABEL: eggs: diff --git a/llvm/test/CodeGen/X86/pr49393.ll b/llvm/test/CodeGen/X86/pr49393.ll index f7bc71d29b07b..3fb6a82f3fbb2 100644 --- a/llvm/test/CodeGen/X86/pr49393.ll +++ b/llvm/test/CodeGen/X86/pr49393.ll @@ -5,14 +5,14 @@ define void @f() { ; CHECK-LABEL: f: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_1: # %for.cond -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: imull %eax, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: mulsd %xmm0, %xmm1 ; CHECK-NEXT: subsd %xmm0, %xmm1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_1: # %for.cond +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: imull %eax, %eax ; CHECK-NEXT: cwtl ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2sd %eax, %xmm2 @@ -21,9 +21,10 @@ define 
void @f() { ; CHECK-NEXT: movapd %xmm2, %xmm3 ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: mulsd %xmm0, %xmm2 -; CHECK-NEXT: subsd %xmm3, %xmm1 -; CHECK-NEXT: addsd %xmm2, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %eax +; CHECK-NEXT: movapd %xmm1, %xmm4 +; CHECK-NEXT: subsd %xmm3, %xmm4 +; CHECK-NEXT: addsd %xmm2, %xmm4 +; CHECK-NEXT: cvttsd2si %xmm4, %eax ; CHECK-NEXT: jmp .LBB0_1 entry: br label %for.cond diff --git a/llvm/test/CodeGen/X86/pr53842.ll b/llvm/test/CodeGen/X86/pr53842.ll index 4a3f751eb1d3c..89f04e3373ae8 100644 --- a/llvm/test/CodeGen/X86/pr53842.ll +++ b/llvm/test/CodeGen/X86/pr53842.ll @@ -8,16 +8,16 @@ define void @PR53842() { ; CHECK-LABEL: PR53842: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm3, %ymm3 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %ymm3 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %ymm2 -; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; CHECK-NEXT: vpsubq 
%zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm4 +; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 +; CHECK-NEXT: vpsubq %zmm4, %zmm0, %zmm0 ; CHECK-NEXT: jmp .LBB0_1 entry: br label %vector.body From 5a815acf43fbf7e823ac12e60ddbe5cee53c5f0e Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Tue, 31 Oct 2023 13:32:27 +0000 Subject: [PATCH 3/7] Remove rudiment and apply formating --- llvm/lib/CodeGen/MachineLICM.cpp | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index f1af74328f002..ef4a9f80549e9 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -794,7 +794,8 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, for (auto *MBB : CurLoop->blocks()) { for (auto &MI : *MBB) { - if (MI.mayStore() || MI.isCall() || (MI.mayLoad() && MI.hasOrderedMemoryRef())) { + if (MI.mayStore() || MI.isCall() || + (MI.mayLoad() && MI.hasOrderedMemoryRef())) { for (MachineLoop *L = MLI->getLoopFor(MI.getParent()); L != CurLoop; L = L->getParentLoop()) AllowedToHoistLoads[L] = false; @@ -813,19 +814,6 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, // Process the block SpeculationState = SpeculateUnknown; - - auto CanMoveLoad = [](MachineLoop *L) -> bool { - dbgs() << L << "\n"; - for (auto *MBB : L->blocks()) { - for (auto &MI : *MBB) { - // Taken from MachineInstr::isSafeToMove - if (MI.mayStore() || MI.isCall() || (MI.mayLoad() && MI.hasOrderedMemoryRef())) - return false; - } - } - return true; - }; - bool SafeToMoveLoad = HoistConstLoads && AllowedToHoistLoads[CurLoop]; for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { unsigned HoistRes = HoistResult::NotHoisted; From b98af963f046529931f82ce35e00233b593a02d4 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Wed, 1 Nov 2023 15:07:21 +0000 Subject: [PATCH 4/7] Address the comments and fix failing tests 
AArch64/machine-licm-hoist-load.ll - added memcmp-sized 6 test, comments, removed irrelevant lines and from AArch64/machine-licm-hoist-load.ll AArch64/sinksplat.ll, AArch64/zext-to-tbl.ll - regenerated check lines without `-hoist-const-loads=false` Adjusted tests after seeing that manually hoisted load results in the same assembly as the code where MachineLICM does this job: * Hexagon/reg-scavengebug-2.ll * Mips/lcb5.ll * Hexagon/swp-const-tc2.ll (added `-hoist-const-loads=false` flag as I am unsure about the test's purpose) --- .../AArch64/machine-licm-hoist-load.ll | 454 +++++++++--------- llvm/test/CodeGen/AArch64/sinksplat.ll | 54 +-- llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 40 +- .../test/CodeGen/Hexagon/reg-scavengebug-2.ll | 2 +- llvm/test/CodeGen/Hexagon/swp-const-tc2.ll | 2 +- llvm/test/CodeGen/Mips/lcb5.ll | 6 +- 6 files changed, 288 insertions(+), 270 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll index 6b76b03fe00fc..01af84ea6922c 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll @@ -1,35 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s -define i64 @one_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { +; One dimensional loop with load that can be hoisted outside of loop +; for (int i = 0; i < N; ++i) +; if (!memcmp(a[i], b, 4)) +; sum += 1; +; +define i64 @one_dimensional(ptr %a, ptr %b, i64 %N) { ; CHECK-LABEL: one_dimensional: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cbz x2, .LBB0_4 -; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: .LBB0_2: // %for.body +; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr x10, [x0], #8 ; CHECK-NEXT: ldr w10, [x10] ; 
CHECK-NEXT: cmp w10, w9 -; CHECK-NEXT: cinc x8, x8, ne +; CHECK-NEXT: cinc x8, x8, eq ; CHECK-NEXT: subs x2, x2, #1 -; CHECK-NEXT: b.ne .LBB0_2 -; CHECK-NEXT: // %bb.3: // %for.cond.cleanup -; CHECK-NEXT: mov x0, x8 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: b.ne .LBB0_1 +; CHECK-NEXT: // %bb.2: // %for.exit ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret entry: - %cmp4 = icmp eq i64 %N, 0 - br i1 %cmp4, label %for.cond.cleanup, label %for.body - -for.cond.cleanup: ; preds = %for.body, %entry - %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.body ] - ret i64 %sum.0.lcssa + br label %for.body for.body: ; preds = %entry, %for.body %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ] @@ -37,56 +31,57 @@ for.body: ; preds = %entry, %for.body %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06 %0 = load ptr, ptr %arrayidx, align 8 %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4) - %tobool = icmp ne i32 %bcmp, 0 + %tobool = icmp eq i32 %bcmp, 0 %add = zext i1 %tobool to i64 %spec.select = add i64 %sum.05, %add %inc = add nuw i64 %i.06, 1 %exitcond = icmp eq i64 %inc, %N - br i1 %exitcond, label %for.cond.cleanup, label %for.body + br i1 %exitcond, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i64 %spec.select } -define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { +; Same but loop is two dimensional. 
Load is hoisted outside of both loops +; for (int i = 0; i < N; ++i) +; for (int j = 0; j < M; ++j) +; if (!memcmp(a[i][j], b, 4)) +; sum += 1; +; +define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M) { ; CHECK-LABEL: two_dimensional: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: cbz x2, .LBB1_6 -; CHECK-NEXT: // %bb.1: // %entry -; CHECK-NEXT: cbz x3, .LBB1_6 -; CHECK-NEXT: // %bb.2: // %for.cond1.preheader.preheader ; CHECK-NEXT: ldr w10, [x1] ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: .LBB1_3: // %for.cond1.preheader +; CHECK-NEXT: .LBB1_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Loop Header: Depth=1 -; CHECK-NEXT: // Child Loop BB1_4 Depth 2 +; CHECK-NEXT: // Child Loop BB1_2 Depth 2 ; CHECK-NEXT: ldr x11, [x0, x9, lsl #3] ; CHECK-NEXT: mov x12, x3 -; CHECK-NEXT: .LBB1_4: // %for.body4 -; CHECK-NEXT: // Parent Loop BB1_3 Depth=1 +; CHECK-NEXT: .LBB1_2: // %for.body4 +; CHECK-NEXT: // Parent Loop BB1_1 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldr x13, [x11], #8 ; CHECK-NEXT: ldr w13, [x13] ; CHECK-NEXT: cmp w13, w10 -; CHECK-NEXT: cinc x8, x8, ne +; CHECK-NEXT: cinc x8, x8, eq ; CHECK-NEXT: subs x12, x12, #1 -; CHECK-NEXT: b.ne .LBB1_4 -; CHECK-NEXT: // %bb.5: // %for.cond1.for.cond.cleanup3_crit_edge -; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: b.ne .LBB1_2 +; CHECK-NEXT: // %bb.3: // %for.cond1.for.exit3_crit_edge +; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: add x9, x9, #1 ; CHECK-NEXT: cmp x9, x2 -; CHECK-NEXT: b.ne .LBB1_3 -; CHECK-NEXT: .LBB1_6: // %for.cond.cleanup +; CHECK-NEXT: b.ne .LBB1_1 +; CHECK-NEXT: // %bb.4: // %for.exit ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret entry: - %cmp17 = icmp eq i64 %N, 0 - %cmp214 = icmp eq i64 %M, 0 - %or.cond = or i1 %cmp17, %cmp214 - br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge - %i.019 
= phi i64 [ %inc7, %for.cond1.for.cond.cleanup3_crit_edge ], [ 0, %entry ] - %sum.018 = phi i64 [ %spec.select, %for.cond1.for.cond.cleanup3_crit_edge ], [ 0, %entry ] + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.cond1.for.exit3_crit_edge + %i.019 = phi i64 [ %inc7, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ] + %sum.018 = phi i64 [ %spec.select, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ] %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.019 %0 = load ptr, ptr %arrayidx, align 8 br label %for.body4 @@ -97,231 +92,220 @@ for.body4: ; preds = %for.cond1.preheader, % %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016 %1 = load ptr, ptr %arrayidx5, align 8 %bcmp = tail call i32 @bcmp(ptr %1, ptr %b, i64 4) - %tobool = icmp ne i32 %bcmp, 0 + %tobool = icmp eq i32 %bcmp, 0 %add = zext i1 %tobool to i64 %spec.select = add i64 %sum.115, %add %inc = add nuw i64 %j.016, 1 %exitcond = icmp eq i64 %inc, %M - br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge, label %for.body4 + br i1 %exitcond, label %for.cond1.for.exit3_crit_edge, label %for.body4 -for.cond1.for.cond.cleanup3_crit_edge: ; preds = %for.body4 +for.cond1.for.exit3_crit_edge: ; preds = %for.body4 %inc7 = add nuw i64 %i.019, 1 %exitcond22 = icmp eq i64 %inc7, %N - br i1 %exitcond22, label %for.cond.cleanup, label %for.cond1.preheader + br i1 %exitcond22, label %for.exit, label %for.cond1.preheader -for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge, %entry - %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond.cleanup3_crit_edge ] - ret i64 %sum.0.lcssa +for.exit: ; preds = %for.cond1.for.exit3_crit_edge + ret i64 %spec.select } -define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { -; CHECK-LABEL: three_dimensional_middle: +; Same but loop is three dimensional. 
Load is hoisted outside of all three loops +; for (int i = 0; i < N; ++i) +; for (int j = 0; j < M; ++j) +; for (int k = 0; k < K; ++k) +; if (!memcmp(a[i][j][k], b, 4)) +; sum += 1; +; +define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { +; CHECK-LABEL: three_dimensional: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov x0, xzr -; CHECK-NEXT: cbz x2, .LBB2_9 -; CHECK-NEXT: // %bb.1: // %entry -; CHECK-NEXT: cbz x3, .LBB2_9 -; CHECK-NEXT: // %bb.2: // %entry -; CHECK-NEXT: cbz x4, .LBB2_9 -; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader +; CHECK-NEXT: ldr w10, [x1] ; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: mov x0, xzr -; CHECK-NEXT: .LBB2_4: // %for.cond1.preheader +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB2_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Loop Header: Depth=1 -; CHECK-NEXT: // Child Loop BB2_5 Depth 2 -; CHECK-NEXT: // Child Loop BB2_6 Depth 3 -; CHECK-NEXT: ldr x10, [x8, x9, lsl #3] -; CHECK-NEXT: mov x11, xzr -; CHECK-NEXT: .LBB2_5: // %for.cond5.preheader -; CHECK-NEXT: // Parent Loop BB2_4 Depth=1 +; CHECK-NEXT: // Child Loop BB2_2 Depth 2 +; CHECK-NEXT: // Child Loop BB2_3 Depth 3 +; CHECK-NEXT: ldr x11, [x0, x9, lsl #3] +; CHECK-NEXT: mov x12, xzr +; CHECK-NEXT: .LBB2_2: // %for.cond5.preheader +; CHECK-NEXT: // Parent Loop BB2_1 Depth=1 ; CHECK-NEXT: // => This Loop Header: Depth=2 -; CHECK-NEXT: // Child Loop BB2_6 Depth 3 -; CHECK-NEXT: lsl x12, x11, #3 +; CHECK-NEXT: // Child Loop BB2_3 Depth 3 +; CHECK-NEXT: ldr x13, [x11, x12, lsl #3] ; CHECK-NEXT: mov x14, x4 -; CHECK-NEXT: ldr x13, [x1, x12] -; CHECK-NEXT: ldr x12, [x10, x12] -; CHECK-NEXT: ldr w13, [x13] -; CHECK-NEXT: .LBB2_6: // %for.body8 -; CHECK-NEXT: // Parent Loop BB2_4 Depth=1 -; CHECK-NEXT: // Parent Loop BB2_5 Depth=2 +; CHECK-NEXT: .LBB2_3: // %for.body8 +; CHECK-NEXT: // Parent Loop BB2_1 Depth=1 +; CHECK-NEXT: // Parent Loop BB2_2 Depth=2 ; CHECK-NEXT: // => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldr 
x15, [x12], #8 +; CHECK-NEXT: ldr x15, [x13], #8 ; CHECK-NEXT: ldr w15, [x15] -; CHECK-NEXT: cmp w15, w13 -; CHECK-NEXT: cinc x0, x0, ne +; CHECK-NEXT: cmp w15, w10 +; CHECK-NEXT: cinc x8, x8, eq ; CHECK-NEXT: subs x14, x14, #1 -; CHECK-NEXT: b.ne .LBB2_6 -; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond -; CHECK-NEXT: // in Loop: Header=BB2_5 Depth=2 -; CHECK-NEXT: add x11, x11, #1 -; CHECK-NEXT: cmp x11, x3 -; CHECK-NEXT: b.ne .LBB2_5 -; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond -; CHECK-NEXT: // in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: b.ne .LBB2_3 +; CHECK-NEXT: // %bb.4: // %for.cond5.for.cond +; CHECK-NEXT: // in Loop: Header=BB2_2 Depth=2 +; CHECK-NEXT: add x12, x12, #1 +; CHECK-NEXT: cmp x12, x3 +; CHECK-NEXT: b.ne .LBB2_2 +; CHECK-NEXT: // %bb.5: // %for.cond1.for.cond +; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1 ; CHECK-NEXT: add x9, x9, #1 ; CHECK-NEXT: cmp x9, x2 -; CHECK-NEXT: b.ne .LBB2_4 -; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup +; CHECK-NEXT: b.ne .LBB2_1 +; CHECK-NEXT: // %bb.6: // %for.exit +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret entry: - %cmp33 = icmp eq i64 %N, 0 - %cmp229 = icmp eq i64 %M, 0 - %or.cond = or i1 %cmp33, %cmp229 - %cmp626 = icmp eq i64 %K, 0 - %or.cond48 = or i1 %or.cond, %cmp626 - br i1 %or.cond48, label %for.cond.cleanup, label %for.cond1.preheader + br label %for.cond1.preheader for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond - %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ] - %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ] - %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035 + %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ] + %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033 %0 = load ptr, ptr %arrayidx, align 8 br label %for.cond5.preheader for.cond5.preheader: ; preds = %for.cond5.for.cond, %for.cond1.preheader - %j.031 = phi i64 [ 0, %for.cond1.preheader 
], [ %inc13, %for.cond5.for.cond ] - %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ] - %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031 + %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ] + %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ] + %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029 %1 = load ptr, ptr %arrayidx9, align 8 - %arrayidx11 = getelementptr inbounds ptr, ptr %b, i64 %j.031 - %2 = load ptr, ptr %arrayidx11, align 8 br label %for.body8 for.body8: ; preds = %for.body8, %for.cond5.preheader - %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ] - %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ] - %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028 - %3 = load ptr, ptr %arrayidx10, align 8 - %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4) - %tobool = icmp ne i32 %bcmp, 0 + %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ] + %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ] + %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.026 + %2 = load ptr, ptr %arrayidx10, align 8 + %bcmp = tail call i32 @bcmp(ptr %2, ptr %b, i64 4) + %tobool = icmp eq i32 %bcmp, 0 %add = zext i1 %tobool to i64 - %spec.select = add i64 %sum.227, %add - %inc = add nuw i64 %k.028, 1 + %spec.select = add i64 %sum.225, %add + %inc = add nuw i64 %k.026, 1 %exitcond = icmp eq i64 %inc, %K br i1 %exitcond, label %for.cond5.for.cond, label %for.body8 for.cond5.for.cond: ; preds = %for.body8 - %inc13 = add nuw i64 %j.031, 1 - %exitcond46 = icmp eq i64 %inc13, %M - br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader + %inc12 = add nuw i64 %j.029, 1 + %exitcond44 = icmp eq i64 %inc12, %M + br i1 %exitcond44, label %for.cond1.for.cond, label %for.cond5.preheader for.cond1.for.cond: ; preds = %for.cond5.for.cond - %inc16 
= add nuw i64 %i.035, 1 - %exitcond47 = icmp eq i64 %inc16, %N - br i1 %exitcond47, label %for.cond.cleanup, label %for.cond1.preheader + %inc15 = add nuw i64 %i.033, 1 + %exitcond45 = icmp eq i64 %inc15, %N + br i1 %exitcond45, label %for.exit, label %for.cond1.preheader -for.cond.cleanup: ; preds = %for.cond1.for.cond, %entry - %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond ] - ret i64 %sum.0.lcssa +for.exit: ; preds = %for.cond1.for.cond + ret i64 %spec.select } -define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { -; CHECK-LABEL: three_dimensional: +; Three dimensional loop but `b` is invariant only relatively to the inner loop. +; Make sure that load is hoisted only outside of first loop +; for (int i = 0; i < N; ++i) +; for (int j = 0; j < M; ++j) +; for (int k = 0; k < K; ++k) +; if (!memcmp(a[i][j][k], b[j], 4)) +; sum += 1; +; +define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { +; CHECK-LABEL: three_dimensional_middle: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: cbz x2, .LBB3_9 -; CHECK-NEXT: // %bb.1: // %entry -; CHECK-NEXT: cbz x3, .LBB3_9 -; CHECK-NEXT: // %bb.2: // %entry -; CHECK-NEXT: cbz x4, .LBB3_9 -; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader -; CHECK-NEXT: ldr w10, [x1] ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: .LBB3_4: // %for.cond1.preheader +; CHECK-NEXT: .LBB3_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Loop Header: Depth=1 -; CHECK-NEXT: // Child Loop BB3_5 Depth 2 -; CHECK-NEXT: // Child Loop BB3_6 Depth 3 -; CHECK-NEXT: ldr x11, [x0, x9, lsl #3] -; CHECK-NEXT: mov x12, xzr -; CHECK-NEXT: .LBB3_5: // %for.cond5.preheader -; CHECK-NEXT: // Parent Loop BB3_4 Depth=1 +; CHECK-NEXT: // Child Loop BB3_2 Depth 2 +; CHECK-NEXT: // Child Loop BB3_3 Depth 3 +; CHECK-NEXT: ldr x10, [x0, x9, lsl #3] +; CHECK-NEXT: mov x11, xzr +; CHECK-NEXT: .LBB3_2: // %for.cond5.preheader +; CHECK-NEXT: // Parent Loop 
BB3_1 Depth=1 ; CHECK-NEXT: // => This Loop Header: Depth=2 -; CHECK-NEXT: // Child Loop BB3_6 Depth 3 -; CHECK-NEXT: ldr x13, [x11, x12, lsl #3] +; CHECK-NEXT: // Child Loop BB3_3 Depth 3 +; CHECK-NEXT: lsl x12, x11, #3 ; CHECK-NEXT: mov x14, x4 -; CHECK-NEXT: .LBB3_6: // %for.body8 -; CHECK-NEXT: // Parent Loop BB3_4 Depth=1 -; CHECK-NEXT: // Parent Loop BB3_5 Depth=2 +; CHECK-NEXT: ldr x13, [x1, x12] +; CHECK-NEXT: ldr x12, [x10, x12] +; CHECK-NEXT: ldr w13, [x13] +; CHECK-NEXT: .LBB3_3: // %for.body8 +; CHECK-NEXT: // Parent Loop BB3_1 Depth=1 +; CHECK-NEXT: // Parent Loop BB3_2 Depth=2 ; CHECK-NEXT: // => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldr x15, [x13], #8 +; CHECK-NEXT: ldr x15, [x12], #8 ; CHECK-NEXT: ldr w15, [x15] -; CHECK-NEXT: cmp w15, w10 -; CHECK-NEXT: cinc x8, x8, ne +; CHECK-NEXT: cmp w15, w13 +; CHECK-NEXT: cinc x8, x8, eq ; CHECK-NEXT: subs x14, x14, #1 -; CHECK-NEXT: b.ne .LBB3_6 -; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond -; CHECK-NEXT: // in Loop: Header=BB3_5 Depth=2 -; CHECK-NEXT: add x12, x12, #1 -; CHECK-NEXT: cmp x12, x3 -; CHECK-NEXT: b.ne .LBB3_5 -; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond -; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1 +; CHECK-NEXT: b.ne .LBB3_3 +; CHECK-NEXT: // %bb.4: // %for.cond5.for.cond +; CHECK-NEXT: // in Loop: Header=BB3_2 Depth=2 +; CHECK-NEXT: add x11, x11, #1 +; CHECK-NEXT: cmp x11, x3 +; CHECK-NEXT: b.ne .LBB3_2 +; CHECK-NEXT: // %bb.5: // %for.cond1.for.cond +; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 ; CHECK-NEXT: add x9, x9, #1 ; CHECK-NEXT: cmp x9, x2 -; CHECK-NEXT: b.ne .LBB3_4 -; CHECK-NEXT: .LBB3_9: // %for.cond.cleanup +; CHECK-NEXT: b.ne .LBB3_1 +; CHECK-NEXT: // %bb.6: // %for.exit ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret entry: - %cmp31 = icmp eq i64 %N, 0 - %cmp227 = icmp eq i64 %M, 0 - %or.cond = or i1 %cmp31, %cmp227 - %cmp624 = icmp eq i64 %K, 0 - %or.cond46 = or i1 %or.cond, %cmp624 - br i1 %or.cond46, label %for.cond.cleanup, label %for.cond1.preheader + br label 
%for.cond1.preheader for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond - %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ] - %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ] - %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033 + %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ] + %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035 %0 = load ptr, ptr %arrayidx, align 8 br label %for.cond5.preheader for.cond5.preheader: ; preds = %for.cond5.for.cond, %for.cond1.preheader - %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ] - %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ] - %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029 + %j.031 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.cond5.for.cond ] + %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ] + %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031 %1 = load ptr, ptr %arrayidx9, align 8 + %arrayidx11 = getelementptr inbounds ptr, ptr %b, i64 %j.031 + %2 = load ptr, ptr %arrayidx11, align 8 br label %for.body8 for.body8: ; preds = %for.body8, %for.cond5.preheader - %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ] - %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ] - %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.026 - %2 = load ptr, ptr %arrayidx10, align 8 - %bcmp = tail call i32 @bcmp(ptr %2, ptr %b, i64 4) - %tobool = icmp ne i32 %bcmp, 0 + %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ] + %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ] + %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028 + %3 = load ptr, ptr %arrayidx10, align 8 + %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4) + %tobool = icmp eq i32 
%bcmp, 0 %add = zext i1 %tobool to i64 - %spec.select = add i64 %sum.225, %add - %inc = add nuw i64 %k.026, 1 + %spec.select = add i64 %sum.227, %add + %inc = add nuw i64 %k.028, 1 %exitcond = icmp eq i64 %inc, %K br i1 %exitcond, label %for.cond5.for.cond, label %for.body8 for.cond5.for.cond: ; preds = %for.body8 - %inc12 = add nuw i64 %j.029, 1 - %exitcond44 = icmp eq i64 %inc12, %M - br i1 %exitcond44, label %for.cond1.for.cond, label %for.cond5.preheader + %inc13 = add nuw i64 %j.031, 1 + %exitcond46 = icmp eq i64 %inc13, %M + br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader for.cond1.for.cond: ; preds = %for.cond5.for.cond - %inc15 = add nuw i64 %i.033, 1 - %exitcond45 = icmp eq i64 %inc15, %N - br i1 %exitcond45, label %for.cond.cleanup, label %for.cond1.preheader + %inc16 = add nuw i64 %i.035, 1 + %exitcond47 = icmp eq i64 %inc16, %N + br i1 %exitcond47, label %for.exit, label %for.cond1.preheader -for.cond.cleanup: ; preds = %for.cond1.for.cond, %entry - %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond ] - ret i64 %sum.0.lcssa +for.exit: ; preds = %for.cond1.for.cond + ret i64 %spec.select } -define i32 @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N, i32 %M, i32 %K) { +; Make sure that store inside loop prevents hoisting invariant loads +; for (int i = 0; i < N; ++i) +; c[i] = memcmp(a[i], b, 4); +; +define void @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N) { ; CHECK-LABEL: one_dimensional_with_store: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB4_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: .LBB4_2: // %for.body +; CHECK-NEXT: .LBB4_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr x9, [x0], #8 ; CHECK-NEXT: ldr w10, [x1] @@ -334,21 +318,16 @@ define i32 @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N, i32 %M, i ; CHECK-NEXT: subs x8, x8, #1 ; CHECK-NEXT: sub 
w9, w9, w10 ; CHECK-NEXT: strb w9, [x2], #1 -; CHECK-NEXT: b.ne .LBB4_2 -; CHECK-NEXT: .LBB4_3: // %for.cond.cleanup -; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: b.ne .LBB4_1 +; CHECK-NEXT: // %bb.2: // %for.exit ; CHECK-NEXT: ret entry: - %cmp6 = icmp sgt i32 %N, 0 - br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + br label %for.body.preheader for.body.preheader: ; preds = %entry %wide.trip.count = zext i32 %N to i64 br label %for.body -for.cond.cleanup: ; preds = %for.body, %entry - ret i32 0 - for.body: ; preds = %for.body.preheader, %for.body %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv @@ -359,10 +338,15 @@ for.body: ; preds = %for.body.preheader, store i8 %conv, ptr %arrayidx2, align 1 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret void } -define i32 @one_dimensional_with_call(ptr %a, ptr %b, i32 %N, i32 %M, i32 %K) { +; Make sure that call inside loop prevents hoisting invariant loads +; +define i32 @one_dimensional_with_call(ptr %a, ptr %b, i32 %N) { ; CHECK-LABEL: one_dimensional_with_call: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill @@ -374,59 +358,95 @@ define i32 @one_dimensional_with_call(ptr %a, ptr %b, i32 %N, i32 %M, i32 %K) { ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: cmp w2, #1 -; CHECK-NEXT: b.lt .LBB5_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov x19, x1 ; CHECK-NEXT: mov x21, x0 ; CHECK-NEXT: mov w20, wzr ; CHECK-NEXT: mov w22, w2 -; CHECK-NEXT: .LBB5_2: // %for.body +; CHECK-NEXT: .LBB5_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr x8, [x21], #8 ; CHECK-NEXT: ldr w9, [x19] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cinc w20, w20, ne +; CHECK-NEXT: cinc w20, w20, eq ; CHECK-NEXT: bl func ; CHECK-NEXT: subs x22, x22, #1 -; CHECK-NEXT: b.ne .LBB5_2 -; CHECK-NEXT: b .LBB5_4 -; CHECK-NEXT: .LBB5_3: -; CHECK-NEXT: mov w20, wzr -; CHECK-NEXT: .LBB5_4: // %for.cond.cleanup +; CHECK-NEXT: b.ne .LBB5_1 +; CHECK-NEXT: // %bb.2: // %for.exit ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: - %cmp4 = icmp sgt i32 %N, 0 - br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + br label %for.body.preheader for.body.preheader: ; preds = %entry %wide.trip.count = zext i32 %N to i64 br label %for.body -for.cond.cleanup: ; preds = %for.body, %entry - %sum.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %for.body ] - ret i32 %sum.0.lcssa - for.body: ; preds = %for.body.preheader, %for.body %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] %sum.05 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ] %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv %0 = load ptr, ptr %arrayidx, align 8 %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4) - %tobool.not = icmp ne i32 %bcmp, 
0 + %tobool.not = icmp eq i32 %bcmp, 0 %add = zext i1 %tobool.not to i32 %spec.select = add nuw nsw i32 %sum.05, %add tail call void @func() %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %spec.select } +; One dimensional loop with memcmp size equal to six. +; The test shows that several loads can be hoisted at the same time. +; for (int i = 0; i < N; ++i) +; if (!memcmp(a[i], b, 6)) +; sum += 1; +; +define i64 @one_dimensional_two_loads(ptr %a, ptr %b, i64 %N) { +; CHECK-LABEL: one_dimensional_two_loads: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: ldrh w10, [x1, #4] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB6_1: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr x11, [x0], #8 +; CHECK-NEXT: ldr w12, [x11] +; CHECK-NEXT: ldrh w11, [x11, #4] +; CHECK-NEXT: cmp w12, w9 +; CHECK-NEXT: ccmp w11, w10, #0, eq +; CHECK-NEXT: cinc x8, x8, eq +; CHECK-NEXT: subs x2, x2, #1 +; CHECK-NEXT: b.ne .LBB6_1 +; CHECK-NEXT: // %bb.2: // %for.exit +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06 + %0 = load ptr, ptr %arrayidx, align 8 + %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 6) + %tobool = icmp eq i32 %bcmp, 0 + %add = zext i1 %tobool to i64 + %spec.select = add i64 %sum.05, %add + %inc = add nuw i64 %i.06, 1 + %exitcond = icmp eq i64 %inc, %N + br i1 %exitcond, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i64 %spec.select +} declare i32 @bcmp(ptr, ptr, i64) declare i32 @memcmp(ptr, ptr, i64) diff --git
a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll index cb63a4d78dc25..d156ec079ae94 100644 --- a/llvm/test/CodeGen/AArch64/sinksplat.ll +++ b/llvm/test/CodeGen/AArch64/sinksplat.ll @@ -1,17 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -hoist-const-loads=false -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s define <4 x i32> @smull(<4 x i16> %x, ptr %y) { ; CHECK-LABEL: smull: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov d1, d0 ; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: .LBB0_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: smlal v0.4s, v2.4h, v1.h[3] +; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: b.eq .LBB0_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -38,12 +38,12 @@ define <4 x i32> @umull(<4 x i16> %x, ptr %y) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov d1, d0 ; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: .LBB1_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: umlal v0.4s, v2.4h, v1.h[3] +; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: b.eq .LBB1_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -69,14 +69,14 @@ define <4 x i32> @sqadd(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: sqadd: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: sqrdmulh v1.4s, v2.4s, v1.s[3] ; CHECK-NEXT: .LBB2_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: subs w8, w8, #1 -; CHECK-NEXT: 
sqrdmulh v2.4s, v2.4s, v1.s[3] -; CHECK-NEXT: sqadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: b.eq .LBB2_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -102,14 +102,14 @@ define <4 x i32> @sqsub(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: sqsub: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: sqrdmulh v1.4s, v2.4s, v1.s[3] ; CHECK-NEXT: .LBB3_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: subs w8, w8, #1 -; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.s[3] -; CHECK-NEXT: sqsub v0.4s, v0.4s, v2.4s ; CHECK-NEXT: b.eq .LBB3_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -135,14 +135,14 @@ define <4 x i32> @sqdmulh(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: sqdmulh: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: sqdmulh v1.4s, v2.4s, v1.s[3] ; CHECK-NEXT: .LBB4_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: subs w8, w8, #1 -; CHECK-NEXT: sqdmulh v2.4s, v2.4s, v1.s[3] -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: b.eq .LBB4_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -168,14 +168,14 @@ define <4 x i32> @sqdmull(<4 x i16> %x, ptr %y) { ; CHECK-LABEL: sqdmull: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov d1, d0 +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: sqdmull v1.4s, v2.4h, v1.h[3] ; CHECK-NEXT: .LBB5_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: subs w8, w8, #1 -; CHECK-NEXT: sqdmull v2.4s, v2.4h, v1.h[3] -; CHECK-NEXT: add v0.4s, v0.4s, 
v2.4s ; CHECK-NEXT: b.eq .LBB5_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -202,13 +202,13 @@ define <4 x i32> @mlal(<4 x i32> %x, ptr %y) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: dup v1.4s, v1.s[3] ; CHECK-NEXT: .LBB6_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: mla v0.4s, v2.4s, v1.4s +; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: b.eq .LBB6_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -234,14 +234,14 @@ define <4 x float> @fmul(<4 x float> %x, ptr %y) { ; CHECK-LABEL: fmul: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: fmul v1.4s, v2.4s, v1.s[3] ; CHECK-NEXT: .LBB7_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s ; CHECK-NEXT: subs w8, w8, #1 -; CHECK-NEXT: fmul v2.4s, v2.4s, v1.s[3] -; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s ; CHECK-NEXT: b.eq .LBB7_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -268,13 +268,13 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: dup v1.4s, v1.s[3] ; CHECK-NEXT: .LBB8_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: fmla v0.4s, v1.4s, v2.4s +; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: b.eq .LBB8_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret @@ -301,13 +301,13 @@ define <4 x float> @fma(<4 x float> %x, ptr %y) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: movi v0.2d, 
#0000000000000000 +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: dup v1.4s, v1.s[3] ; CHECK-NEXT: .LBB9_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov v3.16b, v0.16b ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: fmla v0.4s, v3.4s, v2.4s ; CHECK-NEXT: b.eq .LBB9_1 @@ -338,11 +338,11 @@ define <4 x i32> @smull_nonsplat(<4 x i16> %x, ptr %y) { ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: trn2 v2.4h, v1.4h, v1.4h ; CHECK-NEXT: zip2 v1.4h, v2.4h, v1.4h +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: .LBB10_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: b.eq .LBB10_1 ; CHECK-NEXT: // %bb.2: // %l2 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index e1f1f5495e9af..e91b93a8a7440 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -hoist-const-loads=false -o - %s | FileCheck %s -; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -hoist-const-loads=false -o - %s | FileCheck --check-prefix=CHECK-BE %s +; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s ; CHECK-LABEL: lCPI0_0: ; CHECK-NEXT: .byte 0 ; 0x0 @@ -2756,40 +2756,38 @@ exit: define i32 @test_pr62620_widening_instr(ptr %p1, ptr %p2, i64 %lx, i32 %h) { ; CHECK-LABEL: test_pr62620_widening_instr: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: lsl x9, x2, #4 -; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: lsl x8, x2, #4 +; CHECK-NEXT: ldr q0, [x0, x8] +; CHECK-NEXT: ldr q1, [x1, 
x8] +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: LBB23_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q0, [x0, x9] -; CHECK-NEXT: ldr q1, [x1, x9] -; CHECK-NEXT: subs w3, w3, #1 ; CHECK-NEXT: uabdl.8h v2, v0, v1 +; CHECK-NEXT: subs w3, w3, #1 ; CHECK-NEXT: uabal2.8h v2, v0, v1 -; CHECK-NEXT: uaddlv.8h s0, v2 -; CHECK-NEXT: fmov w10, s0 -; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: uaddlv.8h s2, v2 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: add w0, w8, w0 ; CHECK-NEXT: b.ne LBB23_1 ; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_pr62620_widening_instr: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: lsl x9, x2, #4 -; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: lsl x8, x2, #4 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x8, x1, x8 ; CHECK-BE-NEXT: mov w0, wzr -; CHECK-BE-NEXT: add x8, x8, x9 -; CHECK-BE-NEXT: add x9, x1, x9 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] ; CHECK-BE-NEXT: .LBB23_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] -; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] -; CHECK-BE-NEXT: subs w3, w3, #1 ; CHECK-BE-NEXT: uabdl v2.8h, v0.8b, v1.8b +; CHECK-BE-NEXT: subs w3, w3, #1 ; CHECK-BE-NEXT: uabal2 v2.8h, v0.16b, v1.16b -; CHECK-BE-NEXT: uaddlv s0, v2.8h -; CHECK-BE-NEXT: fmov w10, s0 -; CHECK-BE-NEXT: add w0, w10, w0 +; CHECK-BE-NEXT: uaddlv s2, v2.8h +; CHECK-BE-NEXT: fmov w8, s2 +; CHECK-BE-NEXT: add w0, w8, w0 ; CHECK-BE-NEXT: b.ne .LBB23_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret diff --git a/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll b/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll index 150d7e271818a..4b481a222c868 100644 --- a/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll +++ b/llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll @@ -1,5 +1,5 @@ ; RUN: llc -O3 -march=hexagon < %s | FileCheck %s -; CHECK: v{{[0-9]+}}.cur = vmem(r{{[0-9]+}}+#0) +; CHECK: 
v{{[0-9]+}} = vmem(r{{[0-9]+}}+#0) target triple = "hexagon" diff --git a/llvm/test/CodeGen/Hexagon/swp-const-tc2.ll b/llvm/test/CodeGen/Hexagon/swp-const-tc2.ll index 73268f160fdd6..859cc7eb17b6b 100644 --- a/llvm/test/CodeGen/Hexagon/swp-const-tc2.ll +++ b/llvm/test/CodeGen/Hexagon/swp-const-tc2.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -rdf-opt=0 < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -rdf-opt=0 < %s -pipeliner-experimental-cg=true -hoist-const-loads=false | FileCheck %s ; Test that we fixup a pipelined loop correctly when the number of ; stages is greater than the compile-time loop trip count. In this diff --git a/llvm/test/CodeGen/Mips/lcb5.ll b/llvm/test/CodeGen/Mips/lcb5.ll index 9c12978a88440..f320f6fc5660c 100644 --- a/llvm/test/CodeGen/Mips/lcb5.ll +++ b/llvm/test/CodeGen/Mips/lcb5.ll @@ -186,7 +186,7 @@ if.end: ; preds = %if.then, %entry } ; ci: .ent z3 -; ci: bteqz $BB6_2 +; ci: bteqz $BB6_3 ; ci: .end z3 ; Function Attrs: nounwind optsize @@ -210,10 +210,10 @@ if.end: ; preds = %if.then, %entry ; ci: .ent z4 ; ci: btnez $BB7_1 # 16 bit inst -; ci: jal $BB7_2 # branch +; ci: jal $BB7_3 # branch ; ci: nop -; ci: .p2align 2 ; ci: $BB7_1: +; ci: .p2align 2 ; ci: .end z4 attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } From b5fbed269de078ab2fc4034544cdf949821eb58c Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Mon, 6 Nov 2023 17:09:50 +0000 Subject: [PATCH 5/7] Refactor code --- llvm/lib/CodeGen/MachineLICM.cpp | 95 ++++++++++++++++---------------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index ef4a9f80549e9..6963babb1429a 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -137,6 +137,9 @@ namespace { bool 
Changed = false; // True if a loop is changed. bool FirstInLoop = false; // True if it's the first LICM in the loop. + // Holds information about whether it is allowed to move load instructions + // out of the loop + SmallDenseMap<MachineLoop *, bool> AllowedToHoistLoads; // Exit blocks of each Loop. DenseMap<MachineLoop *, SmallVector<MachineBasicBlock *, 8>> ExitBlockMap; @@ -227,11 +230,9 @@ namespace { void AddToLiveIns(MCRegister Reg, MachineLoop *CurLoop); - bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop, - bool SafeToMoveLoad); + bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop); - bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop, - bool SafeToMoveLoad); + bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop); bool HasLoopPHIUse(const MachineInstr *MI, MachineLoop *CurLoop); @@ -284,7 +285,7 @@ namespace { bool MayCSE(MachineInstr *MI); unsigned Hoist(MachineInstr *MI, MachineBasicBlock *Preheader, - MachineLoop *CurLoop, bool SafeToMoveLoad); + MachineLoop *CurLoop); void InitCSEMap(MachineBasicBlock *BB); @@ -376,6 +377,36 @@ bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); SmallVector<MachineLoop *, 8> Worklist(MLI->begin(), MLI->end()); + + // Initialize `AllowedToHoistLoads' if needed. + if (HoistConstLoads) { + auto TmpWorklist = Worklist; + // Initialize all loops with true values + while (!TmpWorklist.empty()) { + auto *L = TmpWorklist.pop_back_val(); + AllowedToHoistLoads[L] = true; + TmpWorklist.insert(TmpWorklist.end(), L->getSubLoops().begin(), + L->getSubLoops().end()); + } + // Go through all the instructions inside top-level loops and, after finding + // one that makes it potentially unsafe to move loads, update load hoisting + // information for each loop containing this instruction.
+ for (auto *TopLoop : Worklist) { + for (auto *MBB : TopLoop->blocks()) { + for (auto &MI : *MBB) { + if (!MI.mayStore() && !MI.isCall() && + !(MI.mayLoad() && MI.hasOrderedMemoryRef())) + continue; + for (MachineLoop *L = MLI->getLoopFor(MI.getParent()); L != TopLoop; + L = L->getParentLoop()) + AllowedToHoistLoads[L] = false; + AllowedToHoistLoads[TopLoop] = false; + break; + } + } + } + } + while (!Worklist.empty()) { MachineLoop *CurLoop = Worklist.pop_back_val(); MachineBasicBlock *CurPreheader = nullptr; @@ -501,7 +532,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, // operands. FIXME: Consider unfold load folding instructions. if (Def && !RuledOut) { int FI = std::numeric_limits::min(); - if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop, false)) || + if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop)) || (TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI))) Candidates.push_back(CandidateInfo(MI, Def, FI)); } @@ -779,33 +810,6 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, BackTrace.clear(); InitRegPressure(Preheader); - // Compute information about whether it is allowed to move load instruction - // out of the current loop or one of the inner loops - SmallDenseMap AllowedToHoistLoads; - if (HoistConstLoads) { - SmallVector Worklist{CurLoop}; - - while (!Worklist.empty()) { - auto *L = Worklist.pop_back_val(); - AllowedToHoistLoads[L] = true; - Worklist.insert(Worklist.end(), L->getSubLoops().begin(), - L->getSubLoops().end()); - } - - for (auto *MBB : CurLoop->blocks()) { - for (auto &MI : *MBB) { - if (MI.mayStore() || MI.isCall() || - (MI.mayLoad() && MI.hasOrderedMemoryRef())) { - for (MachineLoop *L = MLI->getLoopFor(MI.getParent()); L != CurLoop; - L = L->getParentLoop()) - AllowedToHoistLoads[L] = false; - AllowedToHoistLoads[CurLoop] = false; - break; - } - } - } - } - // Now perform LICM. 
for (MachineDomTreeNode *Node : Scopes) { MachineBasicBlock *MBB = Node->getBlock(); @@ -814,10 +818,9 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, // Process the block SpeculationState = SpeculateUnknown; - bool SafeToMoveLoad = HoistConstLoads && AllowedToHoistLoads[CurLoop]; for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { unsigned HoistRes = HoistResult::NotHoisted; - HoistRes = Hoist(&MI, Preheader, CurLoop, SafeToMoveLoad); + HoistRes = Hoist(&MI, Preheader, CurLoop); if (HoistRes & HoistResult::NotHoisted) { // We have failed to hoist MI to outermost loop's preheader. If MI is in // a subloop, try to hoist it to subloop's preheader. @@ -828,12 +831,9 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, while (!InnerLoopWorkList.empty()) { MachineLoop *InnerLoop = InnerLoopWorkList.pop_back_val(); - bool SafeToMoveLoadInner = - HoistConstLoads && AllowedToHoistLoads[InnerLoop]; MachineBasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); if (InnerLoopPreheader) { - HoistRes = - Hoist(&MI, InnerLoopPreheader, InnerLoop, SafeToMoveLoadInner); + HoistRes = Hoist(&MI, InnerLoopPreheader, InnerLoop); if (HoistRes & HoistResult::Hoisted) break; } @@ -1028,10 +1028,9 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI, /// Returns true if the instruction may be a suitable candidate for LICM. /// e.g. If the instruction is a call, then it's obviously not safe to hoist it. -bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop, - bool SafeToMoveLoad) { +bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) { // Check if it's safe to move the instruction. 
- bool DontMoveAcrossStore = !SafeToMoveLoad; + bool DontMoveAcrossStore = !HoistConstLoads || !AllowedToHoistLoads[CurLoop]; if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) && !(HoistConstStores && isInvariantStore(I, TRI, MRI))) { LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n"); @@ -1064,9 +1063,9 @@ bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop, } /// Returns true if the instruction is loop invariant. -bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop, - bool SafeToMoveLoad) { - if (!IsLICMCandidate(I, CurLoop, SafeToMoveLoad)) { +bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I, + MachineLoop *CurLoop) { + if (!IsLICMCandidate(I, CurLoop)) { LLVM_DEBUG(dbgs() << "LICM: Instruction not a LICM candidate\n"); return false; } @@ -1344,7 +1343,7 @@ MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI, MBB->insert(Pos, NewMIs[1]); // If unfolding produced a load that wasn't loop-invariant or profitable to // hoist, discard the new instructions and bail. - if (!IsLoopInvariantInst(*NewMIs[0], CurLoop, /*SaveToMovLoad=*/false) || + if (!IsLoopInvariantInst(*NewMIs[0], CurLoop) || !IsProfitableToHoist(*NewMIs[0], CurLoop)) { NewMIs[0]->eraseFromParent(); NewMIs[1]->eraseFromParent(); @@ -1471,7 +1470,7 @@ bool MachineLICMBase::MayCSE(MachineInstr *MI) { /// that are safe to hoist, this instruction is called to do the dirty work. /// It returns true if the instruction is hoisted. unsigned MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader, - MachineLoop *CurLoop, bool SafeToMoveLoad) { + MachineLoop *CurLoop) { MachineBasicBlock *SrcBlock = MI->getParent(); // Disable the instruction hoisting due to block hotness @@ -1483,7 +1482,7 @@ unsigned MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader, } // First check whether we should hoist this instruction. 
bool HasExtractHoistableLoad = false; - if (!IsLoopInvariantInst(*MI, CurLoop, SafeToMoveLoad) || + if (!IsLoopInvariantInst(*MI, CurLoop) || !IsProfitableToHoist(*MI, CurLoop)) { // If not, try unfolding a hoistable load. MI = ExtractHoistableLoad(MI, CurLoop); From 2794d6df70fdb8bc16914fd792204888b53e8e3e Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Tue, 7 Nov 2023 17:39:15 +0000 Subject: [PATCH 6/7] Update fma-commute-loop.ll after removing flag --- llvm/test/CodeGen/X86/fma-commute-loop.ll | 64 +++++++++++------------ 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/llvm/test/CodeGen/X86/fma-commute-loop.ll b/llvm/test/CodeGen/X86/fma-commute-loop.ll index a22e5d2e5e0c3..d26aaaa71e28d 100644 --- a/llvm/test/CodeGen/X86/fma-commute-loop.ll +++ b/llvm/test/CodeGen/X86/fma-commute-loop.ll @@ -1,60 +1,58 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f -hoist-const-loads=false | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, ptr %arg13, ptr %arg14) nounwind { ; CHECK-LABEL: eggs: ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; CHECK-NEXT: leaq (%r12,%r14,8), %r14 -; CHECK-NEXT: leaq (%r12,%r15,8), %r15 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq 
{{[0-9]+}}(%rsp), %rbx +; CHECK-NEXT: leaq (%rbx,%r10,8), %r10 +; CHECK-NEXT: leaq (%rbx,%r11,8), %r11 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: xorl %r12d, %r12d -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; CHECK-NEXT: addq %rbx, %r13 -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rbx -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: vmovupd (%r14,%r15,8), %zmm1 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; CHECK-NEXT: addq %r12, %r15 +; CHECK-NEXT: vmovupd (%r14,%r15,8), %zmm2 +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r12 +; CHECK-NEXT: vmovupd (%r14,%r12,8), %zmm8 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vxorpd %xmm5, %xmm5, %xmm5 +; CHECK-NEXT: vxorpd %xmm6, %xmm6, %xmm6 +; CHECK-NEXT: vxorpd %xmm7, %xmm7, %xmm7 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_1: ## %bb15 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm6 -; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm7 -; CHECK-NEXT: vmovupd (%rax,%rbx,8), %zmm8 -; CHECK-NEXT: vbroadcastsd (%r15,%r12,8), %zmm9 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2 -; CHECK-NEXT: vbroadcastsd (%r14,%r12,8), %zmm9 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5 -; CHECK-NEXT: incq %r12 -; CHECK-NEXT: cmpq %r12, %r10 +; CHECK-NEXT: vbroadcastsd (%r11,%rbx,8), %zmm9 +; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm9) + zmm0 +; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm2 * zmm9) + zmm3 +; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm8 * zmm9) + zmm4 +; CHECK-NEXT: vbroadcastsd (%r10,%rbx,8), %zmm9 +; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm1 * zmm9) + 
zmm5 +; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm6 = (zmm2 * zmm9) + zmm6 +; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm7 = (zmm8 * zmm9) + zmm7 +; CHECK-NEXT: incq %rbx +; CHECK-NEXT: cmpq %rbx, %rax ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %bb51 ; CHECK-NEXT: vmovapd %zmm0, (%rdi) -; CHECK-NEXT: vmovapd %zmm1, (%rsi) -; CHECK-NEXT: vmovapd %zmm2, (%rdx) -; CHECK-NEXT: vmovapd %zmm3, (%rcx) -; CHECK-NEXT: vmovapd %zmm4, (%r8) -; CHECK-NEXT: vmovapd %zmm5, (%r9) +; CHECK-NEXT: vmovapd %zmm3, (%rsi) +; CHECK-NEXT: vmovapd %zmm4, (%rdx) +; CHECK-NEXT: vmovapd %zmm5, (%rcx) +; CHECK-NEXT: vmovapd %zmm6, (%r8) +; CHECK-NEXT: vmovapd %zmm7, (%r9) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: vzeroupper From 37c7bc05a8651c27b90e307003b7fd946eb28373 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Fri, 10 Nov 2023 12:54:08 +0000 Subject: [PATCH 7/7] Refactor initialization to a separate function Also improve the algorithm to early exit when loop chain is proven to be unhoistable --- llvm/lib/CodeGen/MachineLICM.cpp | 75 +++++++++++++++++++------------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 6963babb1429a..b9df5bcdc0e6f 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -289,6 +289,8 @@ namespace { void InitCSEMap(MachineBasicBlock *BB); + void InitializeLoadsHoistableLoops(); + bool isTgtHotterThanSrc(MachineBasicBlock *SrcBlock, MachineBasicBlock *TgtBlock); MachineBasicBlock *getCurPreheader(MachineLoop *CurLoop, @@ -376,37 +378,10 @@ bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) { DT = &getAnalysis(); AA = &getAnalysis().getAAResults(); - SmallVector Worklist(MLI->begin(), MLI->end()); - - // Initialize `AllowedToHoistLoads' if needed.
- if (HoistConstLoads) { - auto TmpWorklist = Worklist; - // Initialize all loops with true values - while (!TmpWorklist.empty()) { - auto *L = TmpWorklist.pop_back_val(); - AllowedToHoistLoads[L] = true; - TmpWorklist.insert(TmpWorklist.end(), L->getSubLoops().begin(), - L->getSubLoops().end()); - } - // Go through all the instructions inside top-level loops and, after finding - // one that makes it potentially unsafe to move loads, update load hoisting - // information for each loop containing this instruction. - for (auto *TopLoop : Worklist) { - for (auto *MBB : TopLoop->blocks()) { - for (auto &MI : *MBB) { - if (!MI.mayStore() && !MI.isCall() && - !(MI.mayLoad() && MI.hasOrderedMemoryRef())) - continue; - for (MachineLoop *L = MLI->getLoopFor(MI.getParent()); L != TopLoop; - L = L->getParentLoop()) - AllowedToHoistLoads[L] = false; - AllowedToHoistLoads[TopLoop] = false; - break; - } - } - } - } + if (HoistConstLoads) + InitializeLoadsHoistableLoops(); + SmallVector Worklist(MLI->begin(), MLI->end()); while (!Worklist.empty()) { MachineLoop *CurLoop = Worklist.pop_back_val(); MachineBasicBlock *CurPreheader = nullptr; @@ -1371,6 +1346,46 @@ void MachineLICMBase::InitCSEMap(MachineBasicBlock *BB) { CSEMap[BB][MI.getOpcode()].push_back(&MI); } +/// Initialize AllowedToHoistLoads with information about whether invariant +/// loads can be moved outside a given loop +void MachineLICMBase::InitializeLoadsHoistableLoops() { + SmallVector Worklist(MLI->begin(), MLI->end()); + SmallVector LoopsInPreOrder; + + // Mark all loops as hoistable initially and prepare a list of loops in + // pre-order DFS. + while (!Worklist.empty()) { + auto *L = Worklist.pop_back_val(); + AllowedToHoistLoads[L] = true; + LoopsInPreOrder.push_back(L); + Worklist.insert(Worklist.end(), L->getSubLoops().begin(), + L->getSubLoops().end()); + } + + // Going from the innermost to outermost loops, check if a loop has + // instructions preventing invariant load hoisting. 
If such an instruction is + // found, mark this loop and all of its parent loops as non-hoistable and continue + // investigating the next loop. + // Visiting in a reversed pre-ordered DFS manner + // allows us to not process all the instructions of the outer loop if the + // inner loop is proved to be non-load-hoistable. + for (auto *Loop : reverse(LoopsInPreOrder)) { + for (auto *MBB : Loop->blocks()) { + // If this loop has already been marked as non-hoistable, skip it. + if (!AllowedToHoistLoads[Loop]) + continue; + for (auto &MI : *MBB) { + if (!MI.mayStore() && !MI.isCall() && + !(MI.mayLoad() && MI.hasOrderedMemoryRef())) + continue; + for (MachineLoop *L = Loop; L != nullptr; L = L->getParentLoop()) + AllowedToHoistLoads[L] = false; + break; + } + } + } +} + /// Find an instruction amount PrevMIs that is a duplicate of MI. /// Return this instruction if it's found. MachineInstr *