diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 85a7663993a04..5973b63b5a802 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1099,6 +1099,36 @@ bool AArch64RegisterInfo::getRegAllocationHints(
     const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
   const MachineRegisterInfo &MRI = MF.getRegInfo();
 
+  // The SVE calling convention preserves registers Z8-Z23. As a result, there
+  // are no ZPR2Strided or ZPR4Strided registers that do not overlap with the
+  // callee-saved registers, and so by default these will be pushed to the back
+  // of the allocation order for the ZPRStridedOrContiguous classes.
+  // If VirtReg is used by a FORM_TRANSPOSED_REG_TUPLE pseudo, we want to
+  // favour reducing the number of copy instructions over reducing the number
+  // of clobbered callee-saved registers, so we add the strided registers as a
+  // hint.
+  unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+  // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE.
+  if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
+       RegID == AArch64::ZPR4StridedOrContiguousRegClassID) &&
+      any_of(MRI.use_nodbg_instructions(VirtReg), [](const MachineInstr &Use) {
+        return Use.getOpcode() ==
+                   AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+               Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
+      })) {
+    const TargetRegisterClass *StridedRC =
+        RegID == AArch64::ZPR2StridedOrContiguousRegClassID
+            ? &AArch64::ZPR2StridedRegClass
+            : &AArch64::ZPR4StridedRegClass;
+
+    for (MCPhysReg Reg : Order)
+      if (StridedRC->contains(Reg))
+        Hints.push_back(Reg);
+
+    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+                                                     VRM);
+  }
+
   for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
     if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
         MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index ef569e480ea3d..86ed63d743713 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -602,8 +602,42 @@ entry:
   %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
   %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
   %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
-  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: udot_form_2x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -650,10 +684,82 @@ entry:
   %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
   %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
   %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
-  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: udot_form_4x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    lsl x9, x1, #1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    add x10, x9, x1
+; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -744,8 +850,42 @@ entry:
   %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
   %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
   %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
-  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usdot_form_2x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -792,10 +932,82 @@ entry:
   %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
   %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
   %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
-  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usdot_form_4x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    lsl x9, x1, #1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    add x10, x9, x1
+; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -888,8 +1100,42 @@ entry:
   %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
   %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
   %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
-  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sdot_form_2x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -936,10 +1182,82 @@ entry:
   %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
   %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
   %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
-  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sdot_form_4x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    lsl x9, x1, #1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    add x10, x9, x1
+; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -1032,8 +1350,42 @@ entry:
   %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
   %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
   %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
-  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sudot_form_2x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -1080,10 +1432,82 @@ entry:
   %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
   %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
   %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
-  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sudot_form_4x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    lsl x9, x1, #1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    add x10, x9, x1
+; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index 49106e12378be..e7d1050b60799 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -94,8 +94,43 @@ entry:
   %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
   %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
   %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
-  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
-  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
+  ret void
+}
+
+define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 8 x i16> %scalable_arg) #0 {
+; CHECK-LABEL: svdot_form_2x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    add x9, x0, x1
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1h { z0.h, z8.h }, pn8/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h, z9.h }, pn8/z, [x9]
+; CHECK-NEXT:    svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT:    svdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0]
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+  %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
   ret void
 }
 
@@ -142,10 +177,82 @@ entry:
   %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
   %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
   %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
-  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: svdot_form_4x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    lsl x9, x1, #1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    add x10, x9, x1
+; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT:    svdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -210,8 +317,43 @@ entry:
   %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
   %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
   %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
-  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
-  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
+  ret void
+}
+
+define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 8 x i16> %scalable_arg) #0 {
+; CHECK-LABEL: uvdot_form_2x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    add x9, x0, x1
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1h { z0.h, z8.h }, pn8/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h, z9.h }, pn8/z, [x9]
+; CHECK-NEXT:    uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT:    uvdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0]
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+  %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
   ret void
 }
 
@@ -258,10 +400,82 @@ entry:
   %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
   %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
   %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
-  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: uvdot_form_4x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    lsl x9, x1, #1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    add x10, x9, x1
+; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT:    uvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -323,10 +537,82 @@ entry:
   %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
   %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
   %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
-  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: suvdot_form_4x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    lsl x9, x1, #1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    add x10, x9, x1
+; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT:    suvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
 
@@ -388,10 +674,82 @@ entry:
   %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
   %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
   %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
-  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
-  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+  ret void
+}
+
+define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usvdot_form_4x_tuple_svecc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-9
+; CHECK-NEXT:    lsl x9, x1, #1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue pn8.b
+; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    add x10, x9, x1
+; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT:    usvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+  %mul3 = shl i64 %stride, 1
+  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+  %mul5 = mul i64 %stride, 3
+  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+  tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
   ret void
 }
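
Reviewer note, not part of the patch: the new hinting logic amounts to a single filtering pass over the default allocation order, run before deferring to the base implementation. A minimal standalone sketch of that pass follows, using the identifiers from the patch (Order, Hints, MCPhysReg); the helper name collectStridedHints is hypothetical and only for illustration.

  // Pull the registers of the strided class to the front of the hint list,
  // preserving their relative order from the default allocation order, so the
  // allocator tries strided tuples (e.g. for FORM_TRANSPOSED_REG_TUPLE users)
  // before the contiguous, callee-saved-overlapping ones.
  static void collectStridedHints(ArrayRef<MCPhysReg> Order,
                                  const TargetRegisterClass &StridedRC,
                                  SmallVectorImpl<MCPhysReg> &Hints) {
    for (MCPhysReg Reg : Order)
      if (StridedRC.contains(Reg))
        Hints.push_back(Reg);
  }

As in the patch, falling through to TargetRegisterInfo::getRegAllocationHints afterwards keeps any copy-derived hints behind the strided ones.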