From 8f466f8d4850e015d79cffc65abbd423624795d9 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Fri, 13 Dec 2024 09:47:48 +0000 Subject: [PATCH 1/6] Add SME2 dot tests using the SVE calling convention --- .../AArch64/sme2-intrinsics-int-dots.ll | 456 ++++++++++++++++++ .../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 390 +++++++++++++++ 2 files changed, 846 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index ef569e480ea3d..0d8ae5a71f141 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -607,6 +607,40 @@ entry: ret void } +define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: udot_form_2x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1] +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { ; CHECK-LABEL: udot_form_4x_tuple: ; CHECK: // %bb.0: // %entry @@ -657,6 +691,86 @@ entry: ret void } +define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: udot_form_4x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: mov z9.d, z4.d +; CHECK-NEXT: mov z10.d, z24.d +; CHECK-NEXT: mov z11.d, z28.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z1.d +; CHECK-NEXT: mov z9.d, z5.d +; CHECK-NEXT: mov z10.d, z25.d +; CHECK-NEXT: mov z11.d, z29.d +; CHECK-NEXT: mov z1.d, z7.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z2.d +; CHECK-NEXT: mov z9.d, z6.d +; CHECK-NEXT: mov z10.d, z26.d +; CHECK-NEXT: mov z11.d, z30.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + 
tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + define void @udot_lane_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: udot_lane_za64_u16_vg1x2: ; CHECK: // %bb.0: @@ -749,6 +863,40 @@ entry: ret void } +define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: usdot_form_2x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1] +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { ; CHECK-LABEL: usdot_form_4x_tuple: ; CHECK: // %bb.0: // %entry @@ -799,6 +947,86 @@ entry: ret void } +define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: usdot_form_4x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: mov z9.d, z4.d +; CHECK-NEXT: mov z10.d, z24.d +; CHECK-NEXT: mov z11.d, z28.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z1.d +; CHECK-NEXT: mov z9.d, z5.d +; CHECK-NEXT: mov z10.d, z25.d +; CHECK-NEXT: mov z11.d, z29.d +; CHECK-NEXT: mov z1.d, z7.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z2.d +; CHECK-NEXT: mov z9.d, z6.d +; CHECK-NEXT: mov z10.d, z26.d +; CHECK-NEXT: mov z11.d, z30.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, 
i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + ; == Multi, indexed (signed) == define void @sdot_lane_za32_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #0 { @@ -893,6 +1121,40 @@ entry: ret void } +define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: sdot_form_2x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1] +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { ; CHECK-LABEL: sdot_form_4x_tuple: ; CHECK: // %bb.0: // %entry @@ -943,6 +1205,86 @@ entry: ret void } +define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: sdot_form_4x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: mov z9.d, z4.d +; CHECK-NEXT: mov z10.d, z24.d +; CHECK-NEXT: mov z11.d, z28.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z1.d +; CHECK-NEXT: mov z9.d, z5.d +; CHECK-NEXT: mov z10.d, z25.d +; CHECK-NEXT: mov z11.d, z29.d +; CHECK-NEXT: mov z1.d, z7.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z2.d +; CHECK-NEXT: mov z9.d, z6.d +; CHECK-NEXT: mov z10.d, z26.d +; CHECK-NEXT: mov z11.d, z30.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + 
tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + define void @sdot_lane_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: sdot_lane_za64_u16_vg1x2: ; CHECK: // %bb.0: @@ -1037,6 +1379,40 @@ entry: ret void } +define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: sudot_form_2x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1] +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + ret void +} + define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { ; CHECK-LABEL: sudot_form_4x_tuple: ; CHECK: // %bb.0: // %entry @@ -1087,6 +1463,86 @@ entry: ret void } +define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: sudot_form_4x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: mov z9.d, z4.d +; CHECK-NEXT: mov z10.d, z24.d +; CHECK-NEXT: mov z11.d, z28.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z1.d +; CHECK-NEXT: mov z9.d, z5.d +; CHECK-NEXT: mov z10.d, z25.d +; CHECK-NEXT: mov z11.d, z29.d +; CHECK-NEXT: mov z1.d, z7.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z2.d +; CHECK-NEXT: mov z9.d, z6.d +; CHECK-NEXT: mov z10.d, z26.d +; CHECK-NEXT: mov z11.d, z30.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, 
i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + attributes #0 = { nounwind "target-features"="+sme2" } attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" } diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll index 49106e12378be..f686c9b228d6f 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll @@ -99,6 +99,41 @@ entry: ret void } +define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: svdot_form_2x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: add x9, x0, x1 +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x9] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z0.h[0] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + ret void +} + define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { ; CHECK-LABEL: svdot_form_4x_tuple: ; CHECK: // %bb.0: // %entry @@ -149,6 +184,86 @@ entry: ret void } +define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: svdot_form_4x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: mov z9.d, z4.d +; CHECK-NEXT: mov z10.d, z24.d +; CHECK-NEXT: mov z11.d, z28.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z1.d +; CHECK-NEXT: mov z9.d, z5.d +; CHECK-NEXT: mov z10.d, z25.d +; CHECK-NEXT: mov z11.d, z29.d +; CHECK-NEXT: mov z1.d, z7.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z2.d +; CHECK-NEXT: mov z9.d, z6.d +; CHECK-NEXT: mov z10.d, z26.d +; CHECK-NEXT: mov z11.d, z30.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, 
i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + ; == UVDOT == define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, %zn1, %zn2, %zm) { @@ -215,6 +330,41 @@ entry: ret void } +define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: uvdot_form_2x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: add x9, x0, x1 +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x9] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z0.h[0] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + ret void +} + define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 { ; CHECK-LABEL: uvdot_form_4x_tuple: ; CHECK: // %bb.0: // %entry @@ -265,6 +415,86 @@ entry: ret void } +define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: uvdot_form_4x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: mov z9.d, z4.d +; CHECK-NEXT: mov z10.d, z24.d +; CHECK-NEXT: mov z11.d, z28.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z1.d +; CHECK-NEXT: mov z9.d, z5.d +; CHECK-NEXT: mov z10.d, z25.d +; CHECK-NEXT: mov z11.d, z29.d +; CHECK-NEXT: mov z1.d, z7.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z2.d +; CHECK-NEXT: mov z9.d, z6.d +; CHECK-NEXT: mov z10.d, z26.d +; CHECK-NEXT: mov z11.d, z30.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, 
i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + ; == SUVDOT == define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { @@ -330,6 +560,86 @@ entry: ret void } +define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: suvdot_form_4x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: mov z9.d, z4.d +; CHECK-NEXT: mov z10.d, z24.d +; CHECK-NEXT: mov z11.d, z28.d +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z1.d +; CHECK-NEXT: mov z9.d, z5.d +; CHECK-NEXT: mov z10.d, z25.d +; CHECK-NEXT: mov z11.d, z29.d +; CHECK-NEXT: mov z1.d, z7.d +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z2.d +; CHECK-NEXT: mov z9.d, z6.d +; CHECK-NEXT: mov z10.d, z26.d +; CHECK-NEXT: mov z11.d, z30.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, 
ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + ; == USVDOT == define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, %zn1, %zn2, %zn3, %zn4, %zm) { @@ -395,6 +705,86 @@ entry: ret void } +define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +; CHECK-LABEL: usvdot_form_4x_tuple_svecc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: mov z9.d, z4.d +; CHECK-NEXT: mov z10.d, z24.d +; CHECK-NEXT: mov z11.d, z28.d +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z1.d +; CHECK-NEXT: mov z9.d, z5.d +; CHECK-NEXT: mov z10.d, z25.d +; CHECK-NEXT: mov z11.d, z29.d +; CHECK-NEXT: mov z1.d, z7.d +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z8.d, z2.d +; CHECK-NEXT: mov z9.d, z6.d +; CHECK-NEXT: mov z10.d, z26.d +; CHECK-NEXT: mov z11.d, z30.d +; CHECK-NEXT: mov z2.d, z27.d +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr 
inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + ret void +} + attributes #0 = { nounwind "target-features"="+sme2" } attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" } From 53e62428141c0091384617d80d10f7b03c54f822 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Fri, 13 Dec 2024 10:08:50 +0000 Subject: [PATCH 2/6] [AArch64][SME2] Extend getRegAllocationHints for ZPRStridedOrContiguousReg ZPR2StridedOrContiguous loads used by a FORM_TRANSPOSED_REG_TUPLE pseudo should attempt to assign a strided register to avoid unnecessary copies, even though this may overlap with the list of SVE callee-saved registers. --- .../Target/AArch64/AArch64RegisterInfo.cpp | 48 +++ .../AArch64/sme2-intrinsics-int-dots.ll | 304 ++++++++---------- .../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 268 +++++++-------- 3 files changed, 302 insertions(+), 318 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 85a7663993a04..6900c93f13a25 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1098,6 +1098,54 @@ bool AArch64RegisterInfo::getRegAllocationHints( SmallVectorImpl &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned RegID = MRI.getRegClass(VirtReg)->getID(); + + // Since the SVE calling convention preserves registers Z8-Z23, there are no + // ZPR2Strided or ZPR4Strided registers which do not overlap with the + // callee-saved registers. These will be pushed to the back of the allocation + // order for the ZPRStridedOrContiguous classes. + // However, if any of the instructions which define VirtReg are + // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE + // pseudo, it will likely be better to try assigning a strided register + // anyway to avoid extra copy instructions. + + if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID || + RegID == AArch64::ZPR4StridedOrContiguousRegClassID) { + + if (!MF.getInfo()->isSVECC()) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, + MF, VRM); + + for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { + // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE + // pseudo is found in the uses, set HintStrided. 
+ bool HintStrided = false; + for (MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) { + unsigned UseOp = Use.getOpcode(); + if (UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || + UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) { + HintStrided = true; + break; + } + } + + if (!HintStrided) + continue; + + // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to + // allocate these first. + TargetRegisterClass StridedRC = + RegID == AArch64::ZPR2StridedOrContiguousRegClassID + ? AArch64::ZPR2StridedRegClass + : AArch64::ZPR4StridedRegClass; + + for (MCPhysReg Reg : StridedRC.getRawAllocationOrder(MF)) + Hints.push_back(Reg); + } + + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, + VRM); + } for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO && diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 0d8ae5a71f141..109b4bc750d3f 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -611,20 +611,20 @@ define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: udot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1] -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -695,46 +695,38 @@ define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: udot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: mov z9.d, z4.d -; CHECK-NEXT: mov z10.d, z24.d -; CHECK-NEXT: mov z11.d, z28.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z1.d -; CHECK-NEXT: mov z9.d, z5.d -; CHECK-NEXT: mov z10.d, z25.d -; CHECK-NEXT: mov z11.d, z29.d -; CHECK-NEXT: mov z1.d, z7.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z2.d -; CHECK-NEXT: mov z9.d, z6.d -; CHECK-NEXT: mov z10.d, z26.d -; CHECK-NEXT: mov z11.d, z30.d -; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] +; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -867,20 +859,20 @@ define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: 
usdot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1] -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -951,46 +943,38 @@ define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: usdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: mov z9.d, z4.d -; CHECK-NEXT: mov z10.d, z24.d -; CHECK-NEXT: mov z11.d, z28.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z1.d -; CHECK-NEXT: mov z9.d, z5.d -; CHECK-NEXT: mov z10.d, z25.d -; CHECK-NEXT: mov z11.d, z29.d -; CHECK-NEXT: mov z1.d, z7.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z2.d -; CHECK-NEXT: mov z9.d, z6.d -; CHECK-NEXT: mov z10.d, z26.d -; CHECK-NEXT: mov z11.d, z30.d -; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { 
z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] +; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1125,20 +1109,20 @@ define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: sdot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1] -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1209,46 +1193,38 @@ define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: sdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: mov z9.d, z4.d -; CHECK-NEXT: mov z10.d, z24.d -; CHECK-NEXT: mov z11.d, z28.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z1.d -; CHECK-NEXT: mov z9.d, z5.d -; CHECK-NEXT: mov z10.d, z25.d -; CHECK-NEXT: mov z11.d, z29.d -; CHECK-NEXT: mov z1.d, z7.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z2.d -; CHECK-NEXT: mov z9.d, z6.d -; CHECK-NEXT: mov z10.d, z26.d -; CHECK-NEXT: mov z11.d, z30.d -; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] +; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1383,20 +1359,20 @@ define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: 
sudot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1] -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1467,46 +1443,38 @@ define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: sudot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: mov z9.d, z4.d -; CHECK-NEXT: mov z10.d, z24.d -; CHECK-NEXT: mov z11.d, z28.d -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z1.d -; CHECK-NEXT: mov z9.d, z5.d -; CHECK-NEXT: mov z10.d, z25.d -; CHECK-NEXT: mov z11.d, z29.d -; CHECK-NEXT: mov z1.d, z7.d -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z2.d -; CHECK-NEXT: mov z9.d, z6.d -; CHECK-NEXT: mov z10.d, z26.d -; CHECK-NEXT: mov z11.d, z30.d -; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { 
z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] +; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll index f686c9b228d6f..016f5f55c825b 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll @@ -103,21 +103,21 @@ define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: svdot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] -; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x9] +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z0.h[0] -; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x9] ; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0] +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -188,46 +188,38 @@ define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: svdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: mov z9.d, z4.d -; CHECK-NEXT: mov z10.d, z24.d -; CHECK-NEXT: mov z11.d, z28.d -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z1.d -; CHECK-NEXT: mov z9.d, z5.d -; CHECK-NEXT: mov z10.d, z25.d -; CHECK-NEXT: mov z11.d, z29.d -; CHECK-NEXT: mov z1.d, z7.d -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z2.d -; CHECK-NEXT: mov z9.d, z6.d -; CHECK-NEXT: mov z10.d, z26.d -; CHECK-NEXT: mov z11.d, z30.d -; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] ; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] +; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -334,21 +326,21 @@ define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; 
CHECK-LABEL: uvdot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] -; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x9] +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z0.h[0] -; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: mov z1.d, z3.d +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x9] ; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0] +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -419,46 +411,38 @@ define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: uvdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10] -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: mov z9.d, z4.d -; CHECK-NEXT: mov z10.d, z24.d -; CHECK-NEXT: mov z11.d, z28.d -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z1.d -; CHECK-NEXT: mov z9.d, z5.d -; CHECK-NEXT: mov z10.d, z25.d -; CHECK-NEXT: mov z11.d, z29.d -; CHECK-NEXT: mov z1.d, z7.d -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z8.d, z2.d -; CHECK-NEXT: mov z9.d, z6.d -; CHECK-NEXT: mov z10.d, z26.d -; CHECK-NEXT: mov z11.d, z30.d -; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: mov z3.d, z31.d +; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, 
pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] ; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] +; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -564,46 +548,38 @@ define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, Date: Tue, 17 Dec 2024 13:32:43 +0000 Subject: [PATCH 3/6] - Replaced HintStrided with any_of - Removed loop over VirtReg def instructions - Add registers from Order to Hints if they are contained in StridedRC --- .../Target/AArch64/AArch64RegisterInfo.cpp | 45 +++++++------------ 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 6900c93f13a25..a806d53db806e 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1108,43 +1108,32 @@ bool AArch64RegisterInfo::getRegAllocationHints( // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE // pseudo, it will likely be better to try assigning a strided register // anyway to avoid extra copy instructions. - if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID || RegID == AArch64::ZPR4StridedOrContiguousRegClassID) { - if (!MF.getInfo()->isSVECC()) - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, - MF, VRM); - - for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { - // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE - // pseudo is found in the uses, set HintStrided. - bool HintStrided = false; - for (MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) { - unsigned UseOp = Use.getOpcode(); - if (UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || - UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) { - HintStrided = true; - break; - } - } - - if (!HintStrided) - continue; + // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE + // pseudo is found in the uses, set HintStrided. + if (any_of(MRI.use_nodbg_instructions(VirtReg), [](MachineInstr &Use) { + return Use.getOpcode() == + AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || + Use.getOpcode() == + AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO; + })) { // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to // allocate these first. 
- TargetRegisterClass StridedRC = + const TargetRegisterClass *StridedRC = RegID == AArch64::ZPR2StridedOrContiguousRegClassID - ? AArch64::ZPR2StridedRegClass - : AArch64::ZPR4StridedRegClass; + ? &AArch64::ZPR2StridedRegClass + : &AArch64::ZPR4StridedRegClass; - for (MCPhysReg Reg : StridedRC.getRawAllocationOrder(MF)) - Hints.push_back(Reg); - } + for (MCPhysReg Reg : Order) + if (StridedRC->contains(Reg)) + Hints.push_back(Reg); - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, - VRM); + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, + MF, VRM); + } } for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { From 1ac19922e6192f7f5023217851694e34439b909e Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Tue, 17 Dec 2024 15:03:59 +0000 Subject: [PATCH 4/6] - Replace uses of undef with poison in sme2 dot intrinsic tests --- .../AArch64/sme2-intrinsics-int-dots.ll | 96 +++++++++---------- .../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 80 ++++++++-------- 2 files changed, 88 insertions(+), 88 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 109b4bc750d3f..86ed63d743713 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -602,8 +602,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) ret void } @@ -636,8 +636,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) ret void } @@ -684,10 +684,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -756,10 +756,10 @@ entry: %18 = extractvalue { , , , } %16, 1 
%19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -850,8 +850,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) ret void } @@ -884,8 +884,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) ret void } @@ -932,10 +932,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -1004,10 +1004,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void 
@llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -1100,8 +1100,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) ret void } @@ -1134,8 +1134,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) ret void } @@ -1182,10 +1182,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -1254,10 +1254,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void 
@llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -1350,8 +1350,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) ret void } @@ -1384,8 +1384,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) ret void } @@ -1432,10 +1432,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -1504,10 +1504,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll index 016f5f55c825b..e7d1050b60799 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll @@ -94,8 +94,8 @@ entry: %4 = tail call { , } 
@llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, poison, i32 0) ret void } @@ -129,8 +129,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, poison, i32 0) ret void } @@ -177,10 +177,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -249,10 +249,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -317,8 +317,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, poison, i32 0) 
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, poison, i32 0) ret void } @@ -352,8 +352,8 @@ entry: %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) %5 = extractvalue { , } %4, 0 %6 = extractvalue { , } %4, 1 - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, undef, i32 0) - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, poison, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, poison, i32 0) ret void } @@ -400,10 +400,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -472,10 +472,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -537,10 +537,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void 
@llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -609,10 +609,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -674,10 +674,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } @@ -746,10 +746,10 @@ entry: %18 = extractvalue { , , , } %16, 1 %19 = extractvalue { , , , } %16, 2 %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, undef, i32 0) - tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, undef, i32 0) - tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, undef, i32 0) - tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, undef, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) ret void } From 70b53175ed406b6d8582deef0b74802a15fad752 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Wed, 18 Dec 2024 11:30:12 +0000 Subject: [PATCH 5/6] - Combine if conditions in getRegAllocationHints --- .../Target/AArch64/AArch64RegisterInfo.cpp | 49 +++++++++---------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index a806d53db806e..8fcf79cecb58d 
100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1098,7 +1098,6 @@ bool AArch64RegisterInfo::getRegAllocationHints( SmallVectorImpl &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegID = MRI.getRegClass(VirtReg)->getID(); // Since the SVE calling convention preserves registers Z8-Z23, there are no // ZPR2Strided or ZPR4Strided registers which do not overlap with the @@ -1108,32 +1107,28 @@ bool AArch64RegisterInfo::getRegAllocationHints( // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE // pseudo, it will likely be better to try assigning a strided register // anyway to avoid extra copy instructions. - if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID || - RegID == AArch64::ZPR4StridedOrContiguousRegClassID) { - - // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE - // pseudo is found in the uses, set HintStrided. - if (any_of(MRI.use_nodbg_instructions(VirtReg), [](MachineInstr &Use) { - return Use.getOpcode() == - AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || - Use.getOpcode() == - AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO; - })) { - - // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to - // allocate these first. - const TargetRegisterClass *StridedRC = - RegID == AArch64::ZPR2StridedOrContiguousRegClassID - ? &AArch64::ZPR2StridedRegClass - : &AArch64::ZPR4StridedRegClass; - - for (MCPhysReg Reg : Order) - if (StridedRC->contains(Reg)) - Hints.push_back(Reg); - - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, - MF, VRM); - } + unsigned RegID = MRI.getRegClass(VirtReg)->getID(); + // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE. + if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID || + RegID == AArch64::ZPR4StridedOrContiguousRegClassID) && + any_of(MRI.use_nodbg_instructions(VirtReg), [](const MachineInstr &Use) { + return Use.getOpcode() == + AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || + Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO; + })) { + // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to + // allocate these first. + const TargetRegisterClass *StridedRC = + RegID == AArch64::ZPR2StridedOrContiguousRegClassID + ? 
&AArch64::ZPR2StridedRegClass + : &AArch64::ZPR4StridedRegClass; + + for (MCPhysReg Reg : Order) + if (StridedRC->contains(Reg)) + Hints.push_back(Reg); + + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, + VRM); } for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { From 52f2bd3b3b09bbfce2855233b803fd33e8b200a4 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Wed, 18 Dec 2024 13:38:12 +0000 Subject: [PATCH 6/6] - Reword comments in getRegAllocationHints --- .../lib/Target/AArch64/AArch64RegisterInfo.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 8fcf79cecb58d..5973b63b5a802 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1099,14 +1099,14 @@ bool AArch64RegisterInfo::getRegAllocationHints( const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); - // Since the SVE calling convention preserves registers Z8-Z23, there are no - // ZPR2Strided or ZPR4Strided registers which do not overlap with the - // callee-saved registers. These will be pushed to the back of the allocation - // order for the ZPRStridedOrContiguous classes. - // However, if any of the instructions which define VirtReg are - // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE - // pseudo, it will likely be better to try assigning a strided register - // anyway to avoid extra copy instructions. + // The SVE calling convention preserves registers Z8-Z23. As a result, there + // are no ZPR2Strided or ZPR4Strided registers that do not overlap with the + // callee-saved registers and so by default these will be pushed to the back + // of the allocation order for the ZPRStridedOrContiguous classes. + // If any of the instructions which define VirtReg are used by the + // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy + // instructions over reducing the number of clobbered callee-save registers, + // so we add the strided registers as a hint. unsigned RegID = MRI.getRegClass(VirtReg)->getID(); // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE. if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID || @@ -1116,8 +1116,6 @@ bool AArch64RegisterInfo::getRegAllocationHints( AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO; })) { - // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to - // allocate these first. const TargetRegisterClass *StridedRC = RegID == AArch64::ZPR2StridedOrContiguousRegClassID ? &AArch64::ZPR2StridedRegClass
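
For reference, the strided-register hinting introduced across patches 3-6 composes into roughly the shape below in AArch64RegisterInfo::getRegAllocationHints. This is a consolidated sketch assembled from the '+' lines of the hunks above, not an additional patch; the template parameters dropped by the plain-text formatting (ArrayRef<MCPhysReg>, SmallVectorImpl<MCPhysReg>) are restored here as assumptions about the upstream signature, and the unchanged tail of the function is elided.

// Sketch only: final form of the FORM_TRANSPOSED_REG_TUPLE hinting after
// PATCH 6/6. Signature template arguments are assumed, not taken verbatim
// from the hunks above.
bool AArch64RegisterInfo::getRegAllocationHints(
    Register VirtReg, ArrayRef<MCPhysReg> Order,
    SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
    const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // The SVE calling convention preserves Z8-Z23, so every ZPR2Strided /
  // ZPR4Strided register overlaps a callee-saved register and is pushed to
  // the back of the allocation order for the ZPRStridedOrContiguous classes.
  // If VirtReg feeds a FORM_TRANSPOSED_REG_TUPLE pseudo, favour removing
  // copies over preserving callee-saved registers by hinting the strided
  // registers first.
  unsigned RegID = MRI.getRegClass(VirtReg)->getID();
  if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
       RegID == AArch64::ZPR4StridedOrContiguousRegClassID) &&
      any_of(MRI.use_nodbg_instructions(VirtReg), [](const MachineInstr &Use) {
        return Use.getOpcode() ==
                   AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
               Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
      })) {
    const TargetRegisterClass *StridedRC =
        RegID == AArch64::ZPR2StridedOrContiguousRegClassID
            ? &AArch64::ZPR2StridedRegClass
            : &AArch64::ZPR4StridedRegClass;

    // Hint only the registers from Order that belong to the strided class, so
    // the allocator tries these before the contiguous alternatives.
    for (MCPhysReg Reg : Order)
      if (StridedRC->contains(Reg))
        Hints.push_back(Reg);

    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                     VRM);
  }

  // Remainder of the function (the loop over MRI.def_instructions(VirtReg)
  // and the default fallthrough) is unchanged by this series and elided here.
}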