From 8ba638551a21709036b77684fcca4b2d7e793e0e Mon Sep 17 00:00:00 2001 From: chengjunp Date: Tue, 25 Jun 2024 17:20:27 +0000 Subject: [PATCH 1/6] [NVPTX] Support inline asm with 128-bit operand in NVPTX backend --- clang/lib/Basic/Targets/NVPTX.h | 1 + .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 3 + llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 2 + llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 68 +++++++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 3 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 94 +++++++++ llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 8 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 2 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 12 +- llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 3 + .../CodeGen/NVPTX/inline-asm-b128-test1.ll | 92 +++++++++ .../CodeGen/NVPTX/inline-asm-b128-test2.ll | 57 ++++++ .../CodeGen/NVPTX/inline-asm-b128-test3.ll | 179 ++++++++++++++++++ 14 files changed, 525 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll create mode 100644 llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll create mode 100644 llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index a5daf36cfac72..9a985e46e22da 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -105,6 +105,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { case 'l': case 'f': case 'd': + case 'q': Info.setAllowsRegister(); return true; } diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index b7a20c351f5ff..380d878c1f532 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -60,6 +60,9 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const { case 6: OS << "%fd"; break; + case 7: + OS << "%rq"; + break; } unsigned VReg = Reg.id() & 0x0FFFFFFF; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index ca077d41d36ba..1645261d74d06 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -315,6 +315,8 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) { Ret = (5 << 28); } else if (RC == &NVPTX::Float64RegsRegClass) { Ret = (6 << 28); + } else if (RC == &NVPTX::Int128RegsRegClass) { + Ret = (7 << 28); } else { report_fatal_error("Bad register class"); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 1e1cbb15e33d4..05706e200bda6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -519,6 +519,20 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { if (tryConstantFP(N)) return; break; + case ISD::CopyToReg: { + if (N->getOperand(1).getValueType() == MVT::i128) { + SelectV2I64toI128(N); + return; + } + break; + } + case ISD::CopyFromReg: { + if(N->getOperand(1).getValueType() == MVT::i128){ + SelectI128toV2I64(N); + return; + } + break; + } default: break; } @@ -3798,6 +3812,60 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand( return true; } +void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) { + // Lower a CopyToReg with two 64-bit inputs + // Dst:i128, lo:i64, hi:i64 + // + // CopyToReg Dst, lo, hi; + // + // ==> + // + // tmp = V2I64toI128 {lo, hi}; + // CopyToReg Dst, tmp; + SDValue Dst = 
N->getOperand(1); + SDValue Lo = N->getOperand(2); + SDValue Hi = N->getOperand(3); + + SDLoc DL(N); + SDNode *Mov = + CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi}); + + SmallVector ResultsType(N->value_begin(), N->value_end()); + SmallVector NewOps(N->getNumOperands() - 1); + NewOps[0] = N->getOperand(0); + NewOps[1] = Dst; + NewOps[2] = SDValue(Mov, 0); + if (N->getNumOperands() == 5) + NewOps[3] = N->getOperand(4); + SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, ResultsType, NewOps); + + ReplaceNode(N, NewValue.getNode()); +} + +void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) { + // Lower CopyFromReg from a 128-bit regs to two 64-bit regs + // Dst:i128, Src:i128 + // + // {lo, hi} = CopyFromReg Src + // + // ==> + // + // {lo, hi} = I128toV2I64 Src + // + SDValue Ch = N->getOperand(0); + SDValue Src = N->getOperand(1); + SDValue Glue = N->getOperand(2); + SDLoc DL(N); + + // Add Glue and Ch to the operands and results to avoid break the execution order + SDNode *Mov = CurDAG->getMachineNode( + NVPTX::I128toV2I64, DL, + {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()}, + {Src, Ch, Glue}); + + ReplaceNode(N, Mov); +} + /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a /// conversion from \p SrcTy to \p DestTy. unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index c5524351f2ff9..49626d4051485 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -74,7 +74,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool SelectSETP_F16X2(SDNode *N); bool SelectSETP_BF16X2(SDNode *N); bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N); - + void SelectV2I64toI128(SDNode *N); + void SelectI128toV2I64(SDNode *N); inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 476a532db0a37..b6d6bb649aad2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -859,6 +859,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); } + // Custom lowering for inline asm with 128-bit operands + setOperationAction(ISD::CopyToReg, MVT::i128, Custom); + setOperationAction(ISD::CopyFromReg, MVT::i128, Custom); + // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. // No FPOW or FREM in PTX. @@ -2804,6 +2808,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerVectorArith(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::CopyToReg: + return LowerCopyToReg_128(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); } @@ -3094,6 +3100,53 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { return Result; } +SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op, + SelectionDAG &DAG) const { + // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit + // operand so that it can pass the legalization. 
+ + assert(Op.getOperand(1).getValueType() == MVT::i128 && + "Custom lowering for 128-bit CopyToReg only"); + + SDNode *Node = Op.getNode(); + SDLoc DL(Node); + + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op->getOperand(2)); + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, + DAG.getIntPtrConstant(0, DL)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, + DAG.getIntPtrConstant(1, DL)); + + SmallVector NewOps(Op->getNumOperands() + 1); + SmallVector ResultsType(Node->value_begin(), Node->value_end()); + + NewOps[0] = Op->getOperand(0); // Chain + NewOps[1] = Op->getOperand(1); // Dst Reg + NewOps[2] = Lo; // Lower 64-bit + NewOps[3] = Hi; // Higher 64-bit + if (Op.getNumOperands() == 4) + NewOps[4] = Op->getOperand(3); // Glue if exists + + return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps); +} + +unsigned NVPTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT, + std::optional RegisterVT = std::nullopt) const { + if(VT == MVT::i128 && RegisterVT == MVT::i128) + return 1; + return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT); +} + +bool NVPTXTargetLowering::splitValueIntoRegisterParts( + SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, + unsigned NumParts, MVT PartVT, std::optional CC) const { + if (Val.getValueType() == MVT::i128 && NumParts == 1) { + Parts[0] = Val; + return true; + } + return false; +} + // This creates target external symbol for a function parameter. // Name of the symbol is composed from its index and the function name. // Negative index corresponds to special parameter (unsized array) used for @@ -5150,6 +5203,7 @@ NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { case 'l': case 'f': case 'd': + case 'q': case '0': case 'N': return C_RegisterClass; @@ -5175,6 +5229,12 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'l': case 'N': return std::make_pair(0U, &NVPTX::Int64RegsRegClass); + case 'q': { + if (STI.getSmVersion() < 70) + report_fatal_error("Inline asm with 128 bit operands is only " + "supported for sm_70 and higher!"); + return std::make_pair(0U, &NVPTX::Int128RegsRegClass); + } case 'f': return std::make_pair(0U, &NVPTX::Float32RegsRegClass); case 'd': @@ -6261,6 +6321,37 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, } } +static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl &Results) { + // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit result + // so that it can pass the legalization + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + SDValue Glue = N->getOperand(2); + + assert(Reg.getValueType() == MVT::i128 && + "Custom lowering for CopyFromReg with 128-bit reg only"); + SmallVector ResultsType(4); + SmallVector NewOps(3); + ResultsType[0] = MVT::i64; + ResultsType[1] = MVT::i64; + ResultsType[2] = N->getValueType(1); + ResultsType[3] = N->getValueType(2); + + NewOps[0] = Chain; + NewOps[1] = Reg; + NewOps[2] = Glue; + + SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps); + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, + {NewValue.getValue(0), NewValue.getValue(1)}); + + Results.push_back(Pair); + Results.push_back(NewValue.getValue(2)); + Results.push_back(NewValue.getValue(3)); +} + void NVPTXTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -6272,6 +6363,9 @@ 
void NVPTXTargetLowering::ReplaceNodeResults( case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); return; + case ISD::CopyFromReg: + ReplaceCopyFromReg_128(N, DAG, Results); + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index e211286fcc556..63262961b363e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -640,6 +640,14 @@ class NVPTXTargetLowering : public TargetLowering { SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCopyToReg_128(SDValue Op, SelectionDAG &DAG) const; + unsigned getNumRegisters(LLVMContext &Context, EVT VT, + std::optional RegisterVT) const override; + bool + splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, + SDValue *Parts, unsigned NumParts, MVT PartVT, + std::optional CC) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index b0d792b5ee3fe..673858f92e7ce 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -51,6 +51,8 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (DestRC == &NVPTX::Int64RegsRegClass) { Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr : NVPTX::BITCONVERT_64_F2I); + } else if (DestRC == &NVPTX::Int128RegsRegClass) { + Op = NVPTX::IMOV128rr; } else if (DestRC == &NVPTX::Float32RegsRegClass) { Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr : NVPTX::BITCONVERT_32_I2F); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index c4c35a1f74ba9..827febe845a4c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2097,6 +2097,8 @@ let IsSimpleMove=1, hasSideEffects=0 in { "mov.u32 \t$dst, $sss;", []>; def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), "mov.u64 \t$dst, $sss;", []>; + def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss), + "mov.b128 \t$dst, $sss;", []>; def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), "mov.b16 \t$dst, $sss;", []>; @@ -3545,6 +3547,9 @@ let hasSideEffects = false in { def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), (ins Int32Regs:$s1, Int32Regs:$s2), "mov.b64 \t$d, {{$s1, $s2}};", []>; + def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d), + (ins Int64Regs:$s1, Int64Regs:$s2), + "mov.b128 \t$d, {{$s1, $s2}};", []>; def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$s1, Float32Regs:$s2), "mov.b64 \t$d, {{$s1, $s2}};", []>; @@ -3560,6 +3565,9 @@ let hasSideEffects = false in { def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), (ins Int64Regs:$s), "mov.b64 \t{{$d1, $d2}}, $s;", []>; + def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2), + (ins Int128Regs:$s), + "mov.b128 \t{{$d1, $d2}}, $s;", []>; def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), (ins Float64Regs:$s), "mov.b64 \t{{$d1, $d2}}, $s;", []>; @@ -3629,7 +3637,7 @@ def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>; // ptx value to 64 bits to match the ISD node's semantics, unless we know we're // truncating back down to 32 bits. 
def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; -def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; +def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>; // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the // result back to 16-bits if necessary. We also need to subtract 16 because @@ -3667,7 +3675,7 @@ def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>; // pattern that avoids the type conversion if we're truncating the result to // i32 anyway. def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; -def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; +def : Pat<(i32 (trunc (i64 (ctpop Int64Regs:$a)))), (POPCr64 Int64Regs:$a)>; // For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. // If we know that we're storing into an i32, we can avoid the final trunc. diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index f1213f030bba7..a8a23f04c1249 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -31,6 +31,8 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) { return ".f32"; if (RC == &NVPTX::Float64RegsRegClass) return ".f64"; + if (RC == &NVPTX::Int128RegsRegClass) + return ".b128"; if (RC == &NVPTX::Int64RegsRegClass) // We use untyped (.b) integer registers here as NVCC does. // Correctness of generated code does not depend on register type, @@ -67,6 +69,8 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) { return "%f"; if (RC == &NVPTX::Float64RegsRegClass) return "%fd"; + if (RC == &NVPTX::Int128RegsRegClass) + return "%rq"; if (RC == &NVPTX::Int64RegsRegClass) return "%rd"; if (RC == &NVPTX::Int32RegsRegClass) diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index b5231a9cf67f9..2011f0f7e328f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -37,6 +37,7 @@ foreach i = 0...4 in { def RS#i : NVPTXReg<"%rs"#i>; // 16-bit def R#i : NVPTXReg<"%r"#i>; // 32-bit def RL#i : NVPTXReg<"%rd"#i>; // 64-bit + def RQ#i : NVPTXReg<"%rq"#i>; // 128-bit def H#i : NVPTXReg<"%h"#i>; // 16-bit float def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float def F#i : NVPTXReg<"%f"#i>; // 32-bit float @@ -62,6 +63,8 @@ def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>; def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>; +// 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only. 
+def Int128Regs : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>; def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>; def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>; def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll new file mode 100644 index 0000000000000..dec0451c34ccc --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -0,0 +1,92 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +@value = internal addrspace(1) global i128 0, align 16 +@llvm.used = appending global [6 x ptr] [ptr @_Z7kernel1v, ptr @_Z7kernel2Pn, ptr @_Z7kernel3Pb, ptr @_Z7kernel4v, ptr @_Z7kernel5Pn, ptr addrspacecast (ptr addrspace(1) @value to ptr)], section "llvm.metadata" + +; Function Attrs: alwaysinline convergent mustprogress willreturn +define void @_Z7kernel1v() #0 { + ; CHECK-LABEL: _Z7kernel1v + ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0; + ; CHECK: mov.u64 [[REG_LO:%rd[0-9]+]], 42; + ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; + ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; } + + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42) #3 + ret void +} + +; Function Attrs: alwaysinline convergent mustprogress willreturn +define void @_Z7kernel2Pn(ptr nocapture readonly %data) #0 { + ; CHECK-LABEL: _Z7kernel2Pn + ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8]; + ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]]; + ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; + ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; } + + %1 = addrspacecast ptr %data to ptr addrspace(1) + %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 + %3 = bitcast <2 x i64> %2 to i128 + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %3) #3 + ret void +} + +; Function Attrs: alwaysinline convergent mustprogress willreturn +define void @_Z7kernel3Pb(ptr nocapture readonly %flag) #0 { + ; CHECK-LABEL: _Z7kernel3Pb + ; CHECK: selp.b64 [[REG_LO:%rd[0-9]+]], 24, 42, {{%p[0-9]+}}; + ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0; + ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; + ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; } + + %1 = addrspacecast ptr %flag to ptr addrspace(1) + %tmp1 = load i8, ptr addrspace(1) %1, align 1 + %tobool.not = icmp eq i8 %tmp1, 0 + %. = select i1 %tobool.not, i128 24, i128 42 + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %.) 
#3 + ret void +} + +; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none) +define void @_Z7kernel4v() #1 { + ; CHECK-LABEL: _Z7kernel4v + ; CHECK-O3: { mov.b128 [[REG_128:%rq[0-9]+]], 41; } + ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]]; + + %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"() #4 + %add = add nsw i128 %1, 1 + %2 = bitcast i128 %add to <2 x i64> + store <2 x i64> %2, ptr addrspace(1) @value, align 16 + ret void +} + +; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none) +define void @_Z7kernel5Pn(ptr nocapture readonly %data) #2 { + ; CHECK-LABEL: _Z7kernel5Pn + ; CHECK-O3: ld.global.v2.u64 {[[REG_LO_IN:%rd[0-9]+]], [[REG_HI_IN:%rd[0-9]+]]}, [{{%rd[0-9]+}}]; + ; CHECK-O3: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO_IN]], [[REG_HI_IN]]}; + ; CHECK-O3: { mov.b128 [[REG_128_OUT:%rq[0-9]+]], [[REG_128_IN]]; } + ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128_OUT]]; + + %1 = addrspacecast ptr %data to ptr addrspace(1) + %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 + %3 = bitcast <2 x i64> %2 to i128 + %4 = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %3) #4 + %add = add nsw i128 %4, 1 + %5 = bitcast i128 %add to <2 x i64> + store <2 x i64> %5, ptr addrspace(1) @value, align 16 + ret void +} + +attributes #0 = { alwaysinline convergent mustprogress willreturn "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" } +attributes #1 = { alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" } +attributes #2 = { alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" } +attributes #3 = { convergent nounwind } +attributes #4 = { nounwind } + + +!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1} + +!0 = !{i32 2, i32 0, i32 3, i32 1} +!1 = !{i32 2, i32 0} diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll new file mode 100644 index 0000000000000..337479a06c3f0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +@u128_max = internal addrspace(1) global i128 0, align 16 +@u128_zero = internal addrspace(1) global i128 0, align 16 +@i128_max = internal addrspace(1) global i128 0, align 16 +@i128_min = internal addrspace(1) global i128 0, align 16 +@v_u128_max = internal addrspace(1) global i128 0, align 16 +@v_u128_zero = internal addrspace(1) global i128 0, align 16 +@v_i128_max = internal addrspace(1) global i128 0, align 16 +@v_i128_min = internal addrspace(1) global i128 0, align 16 +@v64 = internal addrspace(1) global i64* null, align 8 +@llvm.used = appending global [10 x i8*] [i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @u128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @u128_zero to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @i128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @i128_min to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_u128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* 
@v_u128_zero to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_i128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_i128_min to i128*) to i8*), i8* bitcast (i64** addrspacecast (i64* addrspace(1)* @v64 to i64**) to i8*), i8* bitcast (void ()* @_Z6kernelv to i8*)], section "llvm.metadata" + +; Function Attrs: alwaysinline +define void @_Z6kernelv() #0 { + ; CHECK-LABLE: _Z6kernelv + ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1; + ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]}; + ; CHECK: mov.u64 [[I128_MAX_HI:%rd[0-9]+]], 9223372036854775807; + ; CHECK: mov.b128 [[I128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[I128_MAX_HI]]}; + ; CHECK: mov.u64 [[I128_MIN_HI:%rd[0-9]+]], -9223372036854775808; + ; CHECK: mov.u64 [[ZERO:%rd[0-9]+]], 0; + ; CHECK: mov.b128 [[I128_MIN:%rq[0-9]+]], {[[ZERO]], [[I128_MIN_HI]]}; + ; CHECK: mov.b128 [[U128_ZERO:%rq[0-9]+]], {[[ZERO]], [[ZERO]]}; + + %tmp = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 + %add.ptr = getelementptr inbounds i64, i64* %tmp, i32 0 + %tmp1 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 + %add.ptr2 = getelementptr inbounds i64, i64* %tmp1, i32 1 + call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -1, i64* %add.ptr, i64* %add.ptr2, i128* addrspacecast (i128 addrspace(1)* @v_u128_max to i128*)) #1 + %tmp3 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 + %add.ptr4 = getelementptr inbounds i64, i64* %tmp3, i32 2 + %tmp5 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 + %add.ptr6 = getelementptr inbounds i64, i64* %tmp5, i32 3 + call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 170141183460469231731687303715884105727, i64* %add.ptr4, i64* %add.ptr6, i128* addrspacecast (i128 addrspace(1)* @v_i128_max to i128*)) #1 + %tmp7 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 + %add.ptr8 = getelementptr inbounds i64, i64* %tmp7, i32 4 + %tmp9 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 + %add.ptr10 = getelementptr inbounds i64, i64* %tmp9, i32 5 + call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -170141183460469231731687303715884105728, i64* %add.ptr8, i64* %add.ptr10, i128* addrspacecast (i128 addrspace(1)* @v_i128_min to i128*)) #1 + %tmp11 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 + %add.ptr12 = getelementptr inbounds i64, i64* %tmp11, i32 6 + %tmp13 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 + %add.ptr14 = getelementptr inbounds i64, i64* %tmp13, i32 7 + call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 0, i64* %add.ptr12, i64* %add.ptr14, i128* addrspacecast (i128 addrspace(1)* @v_u128_zero to i128*)) #1 + ret void +} + +attributes #0 = { alwaysinline "nvvm.annotations_transplanted" "nvvm.kernel" } +attributes #1 = { nounwind } + +!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, 
!1} + +!0 = !{i32 2, i32 0, i32 3, i32 1} +!1 = !{i32 2, i32 0} diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll new file mode 100644 index 0000000000000..4f077ec5383c9 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -0,0 +1,179 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +@size = internal addrspace(1) global i32 0, align 4 +@value = internal addrspace(1) global i128 0, align 16 +@x = internal addrspace(1) global i128 0, align 16 +@y = internal addrspace(1) global i128 0, align 16 +@z = internal addrspace(1) global i128 0, align 16 +@llvm.used = appending global [6 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @size to ptr), ptr addrspacecast (ptr addrspace(1) @value to ptr), ptr addrspacecast (ptr addrspace(1) @x to ptr), ptr addrspacecast (ptr addrspace(1) @y to ptr), ptr addrspacecast (ptr addrspace(1) @z to ptr)], section "llvm.metadata" + +; Function Attrs: alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none) +define void @_Z6kernelv() #0 { + ; CHECK-LABEL: _Z6kernelv + ; CHECK: mov.b128 [[X:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: mov.b128 [[Y:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: mov.b128 [[Z:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: mov.b128 {lo, hi}, [[X]]; + ; CHECK: mov.b128 [[X]], {lo, hi}; + ; CHECK: mov.b128 {lo, hi}, [[Y]]; + ; CHECK: mov.b128 [[Y]], {lo, hi}; + ; CHECK: mov.b128 {lo, hi}, [[Z]]; + ; CHECK: mov.b128 [[Z]], {lo, hi}; + ; CHECK: mov.b128 {[[X_LO:%rd[0-9]+]], [[X_HI:%rd[0-9]+]]}, [[X]]; + ; CHECK: mov.b128 {[[Y_LO:%rd[0-9]+]], [[Y_HI:%rd[0-9]+]]}, [[Y]]; + ; CHECK: mov.b128 {[[Z_LO:%rd[0-9]+]], [[Z_HI:%rd[0-9]+]]}, [[Z]]; + ; CHECK: mov.b128 [[X_NEW:%rq[0-9]+]], {[[X_LO]], [[X_HI]]}; + ; CHECK: mov.b128 [[Y_NEW:%rq[0-9]+]], {[[Y_LO]], [[Y_HI]]}; + ; CHECK: mov.b128 [[Z_NEW:%rq[0-9]+]], {[[Z_LO]], [[Z_HI]]}; + ; CHECK: mov.b128 {lo, hi}, [[X_NEW]]; + ; CHECK: mov.b128 [[X_NEW]], {lo, hi}; + ; CHECK: mov.b128 {lo, hi}, [[Y_NEW]]; + ; CHECK: mov.b128 [[Y_NEW]], {lo, hi}; + ; CHECK: mov.b128 {lo, hi}, [[Z_NEW]]; + ; CHECK: mov.b128 [[Z_NEW]], {lo, hi}; + ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[X_NEW]]; + ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[Y_NEW]]; + ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[Z_NEW]]; + + %tmp11 = load i32, ptr addrspace(1) @size, align 4 + %cmp3.not = icmp eq i32 %tmp11, 0 + br i1 %cmp3.not, label %._crit_edge, label %.lr.ph.preheader + +.lr.ph.preheader: ; preds = %0 + %x.promoted5 = load i128, ptr addrspace(1) @x, align 16 + %y.promoted6 = load i128, ptr addrspace(1) @y, align 16 + %z.promoted7 = load i128, ptr addrspace(1) @z, align 16 + %value.promoted8 = load i128, ptr addrspace(1) @value, align 16 + %umax = sext i32 %tmp11 to i64 + %xtraiter = and i64 %umax, 3 + %1 = icmp ult i32 %tmp11, 4 + br i1 %1, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph.preheader.new + +.lr.ph.preheader.new: ; preds = %.lr.ph.preheader + %unroll_iter = and i64 %umax, -4 + br label %.lr.ph + +.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader.new + %2 = phi i128 [ %value.promoted8, %.lr.ph.preheader.new ], [ %add14.3, %.lr.ph ] + %3 = phi i128 [ %z.promoted7, %.lr.ph.preheader.new ], [ %asmresult21.3, %.lr.ph ] + %4 = phi i128 [ %y.promoted6, %.lr.ph.preheader.new ], [ %asmresult20.3, %.lr.ph ] + %5 = phi i128 [ %x.promoted5, %.lr.ph.preheader.new ], [ %asmresult19.3, %.lr.ph 
] + %i.04 = phi i64 [ 0, %.lr.ph.preheader.new ], [ %inc.3, %.lr.ph ] + %niter = phi i64 [ 0, %.lr.ph.preheader.new ], [ %niter.next.3, %.lr.ph ] + %6 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %5, i128 %4, i128 %3) #1 + %asmresult = extractvalue { i128, i128, i128 } %6, 0 + %asmresult7 = extractvalue { i128, i128, i128 } %6, 1 + %asmresult8 = extractvalue { i128, i128, i128 } %6, 2 + %add = add nsw i128 %asmresult, %asmresult7 + %add12 = add nsw i128 %add, %asmresult8 + %add14 = add nsw i128 %add12, %2 + %7 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %asmresult, i128 %asmresult7, i128 %asmresult8) #1 + %asmresult19 = extractvalue { i128, i128, i128 } %7, 0 + %asmresult20 = extractvalue { i128, i128, i128 } %7, 1 + %asmresult21 = extractvalue { i128, i128, i128 } %7, 2 + %inc = add nuw nsw i64 %i.04, 1 + %8 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult19, i128 %asmresult20, i128 %asmresult21) #1 + %asmresult.1 = extractvalue { i128, i128, i128 } %8, 0 + %asmresult7.1 = extractvalue { i128, i128, i128 } %8, 1 + %asmresult8.1 = extractvalue { i128, i128, i128 } %8, 2 + %add.1 = add nsw i128 %asmresult.1, %asmresult7.1 + %add12.1 = add nsw i128 %add.1, %asmresult8.1 + %add14.1 = add nsw i128 %add12.1, %add14 + %9 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult.1, i128 %asmresult7.1, i128 %asmresult8.1) #1 + %asmresult19.1 = extractvalue { i128, i128, i128 } %9, 0 + %asmresult20.1 = extractvalue { i128, i128, i128 } %9, 1 + %asmresult21.1 = extractvalue { i128, i128, i128 } %9, 2 + %inc.1 = add nuw nsw i64 %i.04, 2 + %10 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult19.1, i128 %asmresult20.1, i128 %asmresult21.1) #1 + %asmresult.2 = extractvalue { i128, i128, i128 } %10, 0 + %asmresult7.2 = extractvalue { 
i128, i128, i128 } %10, 1 + %asmresult8.2 = extractvalue { i128, i128, i128 } %10, 2 + %add.2 = add nsw i128 %asmresult.2, %asmresult7.2 + %add12.2 = add nsw i128 %add.2, %asmresult8.2 + %add14.2 = add nsw i128 %add12.2, %add14.1 + %11 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult.2, i128 %asmresult7.2, i128 %asmresult8.2) #1 + %asmresult19.2 = extractvalue { i128, i128, i128 } %11, 0 + %asmresult20.2 = extractvalue { i128, i128, i128 } %11, 1 + %asmresult21.2 = extractvalue { i128, i128, i128 } %11, 2 + %inc.2 = add nuw nsw i64 %i.04, 3 + %12 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult19.2, i128 %asmresult20.2, i128 %asmresult21.2) #1 + %asmresult.3 = extractvalue { i128, i128, i128 } %12, 0 + %asmresult7.3 = extractvalue { i128, i128, i128 } %12, 1 + %asmresult8.3 = extractvalue { i128, i128, i128 } %12, 2 + %add.3 = add nsw i128 %asmresult.3, %asmresult7.3 + %add12.3 = add nsw i128 %add.3, %asmresult8.3 + %add14.3 = add nsw i128 %add12.3, %add14.2 + %13 = bitcast i128 %add14.3 to <2 x i64> + store <2 x i64> %13, ptr addrspace(1) @value, align 16 + %14 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult.3, i128 %asmresult7.3, i128 %asmresult8.3) #1 + %asmresult19.3 = extractvalue { i128, i128, i128 } %14, 0 + %asmresult20.3 = extractvalue { i128, i128, i128 } %14, 1 + %asmresult21.3 = extractvalue { i128, i128, i128 } %14, 2 + %15 = bitcast i128 %asmresult19.3 to <2 x i64> + store <2 x i64> %15, ptr addrspace(1) @x, align 16 + %16 = bitcast i128 %asmresult20.3 to <2 x i64> + store <2 x i64> %16, ptr addrspace(1) @y, align 16 + %17 = bitcast i128 %asmresult21.3 to <2 x i64> + store <2 x i64> %17, ptr addrspace(1) @z, align 16 + %inc.3 = add nuw i64 %i.04, 4 + %niter.next.3 = add i64 %niter, 4 + %niter.ncmp.3.not = icmp eq i64 %niter.next.3, %unroll_iter + br i1 %niter.ncmp.3.not, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph, !llvm.loop !2 + +._crit_edge.loopexit.unr-lcssa: ; preds = %.lr.ph, %.lr.ph.preheader + %.unr = phi i128 [ %value.promoted8, %.lr.ph.preheader ], [ %add14.3, %.lr.ph ] + %.unr9 = phi i128 [ %z.promoted7, %.lr.ph.preheader ], [ %asmresult21.3, %.lr.ph ] + %.unr10 = phi i128 [ %y.promoted6, %.lr.ph.preheader ], [ %asmresult20.3, %.lr.ph ] + %.unr11 = phi i128 [ %x.promoted5, %.lr.ph.preheader ], [ %asmresult19.3, %.lr.ph ] + %i.04.unr = phi i64 [ 0, %.lr.ph.preheader ], [ %inc.3, %.lr.ph ] + %lcmp.mod.not = icmp eq i64 %xtraiter, 0 + br i1 %lcmp.mod.not, label %._crit_edge, 
label %.lr.ph.epil + +.lr.ph.epil: ; preds = %.lr.ph.epil, %._crit_edge.loopexit.unr-lcssa + %18 = phi i128 [ %add14.epil, %.lr.ph.epil ], [ %.unr, %._crit_edge.loopexit.unr-lcssa ] + %19 = phi i128 [ %asmresult21.epil, %.lr.ph.epil ], [ %.unr9, %._crit_edge.loopexit.unr-lcssa ] + %20 = phi i128 [ %asmresult20.epil, %.lr.ph.epil ], [ %.unr10, %._crit_edge.loopexit.unr-lcssa ] + %21 = phi i128 [ %asmresult19.epil, %.lr.ph.epil ], [ %.unr11, %._crit_edge.loopexit.unr-lcssa ] + %i.04.epil = phi i64 [ %inc.epil, %.lr.ph.epil ], [ %i.04.unr, %._crit_edge.loopexit.unr-lcssa ] + %epil.iter = phi i64 [ %epil.iter.next, %.lr.ph.epil ], [ 0, %._crit_edge.loopexit.unr-lcssa ] + %22 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %21, i128 %20, i128 %19) #1 + %asmresult.epil = extractvalue { i128, i128, i128 } %22, 0 + %asmresult7.epil = extractvalue { i128, i128, i128 } %22, 1 + %asmresult8.epil = extractvalue { i128, i128, i128 } %22, 2 + %add.epil = add nsw i128 %asmresult.epil, %asmresult7.epil + %add12.epil = add nsw i128 %add.epil, %asmresult8.epil + %add14.epil = add nsw i128 %add12.epil, %18 + %23 = bitcast i128 %add14.epil to <2 x i64> + store <2 x i64> %23, ptr addrspace(1) @value, align 16 + %24 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %asmresult.epil, i128 %asmresult7.epil, i128 %asmresult8.epil) #1 + %asmresult19.epil = extractvalue { i128, i128, i128 } %24, 0 + %asmresult20.epil = extractvalue { i128, i128, i128 } %24, 1 + %asmresult21.epil = extractvalue { i128, i128, i128 } %24, 2 + %25 = bitcast i128 %asmresult19.epil to <2 x i64> + store <2 x i64> %25, ptr addrspace(1) @x, align 16 + %26 = bitcast i128 %asmresult20.epil to <2 x i64> + store <2 x i64> %26, ptr addrspace(1) @y, align 16 + %27 = bitcast i128 %asmresult21.epil to <2 x i64> + store <2 x i64> %27, ptr addrspace(1) @z, align 16 + %inc.epil = add nuw i64 %i.04.epil, 1 + %epil.iter.next = add i64 %epil.iter, 1 + %epil.iter.cmp.not = icmp eq i64 %epil.iter.next, %xtraiter + br i1 %epil.iter.cmp.not, label %._crit_edge, label %.lr.ph.epil, !llvm.loop !4 + +._crit_edge: ; preds = %.lr.ph.epil, %._crit_edge.loopexit.unr-lcssa, %0 + ret void +} + +attributes #0 = { alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" } +attributes #1 = { nounwind } + +!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1} + +!0 = !{i32 2, i32 0, i32 3, i32 1} +!1 = !{i32 2, i32 0} +!2 = distinct !{!2, !3} +!3 = !{!"llvm.loop.mustprogress"} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} From 3c95fe4097059bde002a5d3f3451da3e1fa05de5 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Tue, 25 Jun 2024 21:54:48 +0000 Subject: [PATCH 2/6] Format Code & Update tests --- 
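
Note (not part of the patch): a minimal CUDA-side sketch of how the new "q" constraint is expected to be used, mirroring the st.b128 sequence exercised by inline-asm-b128-test1.ll. The kernel name and parameter are illustrative only, and this assumes compilation for sm_70 or newer, since the backend reports a fatal error for "q" on older targets.

    // Illustrative sketch only; names are hypothetical.
    __global__ void store_b128(unsigned __int128 *out) {
      // The 128-bit value is passed through the new "q" constraint (one %rq
      // virtual register); the address uses the existing 64-bit "l" constraint.
      unsigned __int128 v = 42;
      asm volatile("{ st.b128 [%0], %1; }" :: "l"(out), "q"(v) : "memory");
    }
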
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 17 ++--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 23 +++--- .../CodeGen/NVPTX/inline-asm-b128-test1.ll | 28 +++----- .../CodeGen/NVPTX/inline-asm-b128-test2.ll | 70 ++++++++----------- .../CodeGen/NVPTX/inline-asm-b128-test3.ll | 24 +++---- 5 files changed, 73 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 05706e200bda6..9c0498560db21 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -527,7 +527,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { break; } case ISD::CopyFromReg: { - if(N->getOperand(1).getValueType() == MVT::i128){ + if (N->getOperand(1).getValueType() == MVT::i128) { SelectI128toV2I64(N); return; } @@ -3825,17 +3825,17 @@ void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) { SDValue Dst = N->getOperand(1); SDValue Lo = N->getOperand(2); SDValue Hi = N->getOperand(3); - + SDLoc DL(N); SDNode *Mov = CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi}); - + SmallVector ResultsType(N->value_begin(), N->value_end()); SmallVector NewOps(N->getNumOperands() - 1); NewOps[0] = N->getOperand(0); NewOps[1] = Dst; NewOps[2] = SDValue(Mov, 0); - if (N->getNumOperands() == 5) + if (N->getNumOperands() == 5) NewOps[3] = N->getOperand(4); SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, ResultsType, NewOps); @@ -3847,17 +3847,18 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) { // Dst:i128, Src:i128 // // {lo, hi} = CopyFromReg Src - // + // // ==> - // + // // {lo, hi} = I128toV2I64 Src - // + // SDValue Ch = N->getOperand(0); SDValue Src = N->getOperand(1); SDValue Glue = N->getOperand(2); SDLoc DL(N); - // Add Glue and Ch to the operands and results to avoid break the execution order + // Add Glue and Ch to the operands and results to avoid break the execution + // order SDNode *Mov = CurDAG->getMachineNode( NVPTX::I128toV2I64, DL, {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()}, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index b6d6bb649aad2..5449e35149a1e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3107,15 +3107,15 @@ SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op, assert(Op.getOperand(1).getValueType() == MVT::i128 && "Custom lowering for 128-bit CopyToReg only"); - + SDNode *Node = Op.getNode(); SDLoc DL(Node); SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op->getOperand(2)); SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, - DAG.getIntPtrConstant(0, DL)); + DAG.getIntPtrConstant(0, DL)); SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, - DAG.getIntPtrConstant(1, DL)); + DAG.getIntPtrConstant(1, DL)); SmallVector NewOps(Op->getNumOperands() + 1); SmallVector ResultsType(Node->value_begin(), Node->value_end()); @@ -3130,11 +3130,12 @@ SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op, return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps); } -unsigned NVPTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT, - std::optional RegisterVT = std::nullopt) const { - if(VT == MVT::i128 && RegisterVT == MVT::i128) - return 1; - return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT); +unsigned NVPTXTargetLowering::getNumRegisters( + LLVMContext &Context, EVT VT, + std::optional RegisterVT = std::nullopt) const { + if (VT == 
MVT::i128 && RegisterVT == MVT::i128) + return 1; + return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT); } bool NVPTXTargetLowering::splitValueIntoRegisterParts( @@ -5203,7 +5204,7 @@ NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { case 'l': case 'f': case 'd': - case 'q': + case 'q': case '0': case 'N': return C_RegisterClass; @@ -6323,8 +6324,8 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl &Results) { - // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit result - // so that it can pass the legalization + // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit + // result so that it can pass the legalization SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Reg = N->getOperand(1); diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll index dec0451c34ccc..8b5369d2804b0 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -3,22 +3,21 @@ target triple = "nvptx64-nvidia-cuda" @value = internal addrspace(1) global i128 0, align 16 -@llvm.used = appending global [6 x ptr] [ptr @_Z7kernel1v, ptr @_Z7kernel2Pn, ptr @_Z7kernel3Pb, ptr @_Z7kernel4v, ptr @_Z7kernel5Pn, ptr addrspacecast (ptr addrspace(1) @value to ptr)], section "llvm.metadata" ; Function Attrs: alwaysinline convergent mustprogress willreturn -define void @_Z7kernel1v() #0 { +define void @_Z7kernel1v() { ; CHECK-LABEL: _Z7kernel1v ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0; ; CHECK: mov.u64 [[REG_LO:%rd[0-9]+]], 42; ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; } - tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42) #3 + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42) ret void } ; Function Attrs: alwaysinline convergent mustprogress willreturn -define void @_Z7kernel2Pn(ptr nocapture readonly %data) #0 { +define void @_Z7kernel2Pn(ptr nocapture readonly %data) { ; CHECK-LABEL: _Z7kernel2Pn ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8]; ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]]; @@ -28,12 +27,12 @@ define void @_Z7kernel2Pn(ptr nocapture readonly %data) #0 { %1 = addrspacecast ptr %data to ptr addrspace(1) %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 %3 = bitcast <2 x i64> %2 to i128 - tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %3) #3 + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %3) ret void } ; Function Attrs: alwaysinline convergent mustprogress willreturn -define void @_Z7kernel3Pb(ptr nocapture readonly %flag) #0 { +define void @_Z7kernel3Pb(ptr nocapture readonly %flag) { ; CHECK-LABEL: _Z7kernel3Pb ; CHECK: selp.b64 [[REG_LO:%rd[0-9]+]], 24, 42, {{%p[0-9]+}}; ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0; @@ -44,17 +43,17 @@ define void @_Z7kernel3Pb(ptr nocapture readonly %flag) #0 { %tmp1 = load i8, ptr addrspace(1) %1, align 1 %tobool.not = icmp eq i8 %tmp1, 0 %. 
= select i1 %tobool.not, i128 24, i128 42 - tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %.) #3 + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %.) ret void } ; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none) -define void @_Z7kernel4v() #1 { +define void @_Z7kernel4v() { ; CHECK-LABEL: _Z7kernel4v ; CHECK-O3: { mov.b128 [[REG_128:%rq[0-9]+]], 41; } ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]]; - %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"() #4 + %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"() %add = add nsw i128 %1, 1 %2 = bitcast i128 %add to <2 x i64> store <2 x i64> %2, ptr addrspace(1) @value, align 16 @@ -62,7 +61,7 @@ define void @_Z7kernel4v() #1 { } ; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none) -define void @_Z7kernel5Pn(ptr nocapture readonly %data) #2 { +define void @_Z7kernel5Pn(ptr nocapture readonly %data) { ; CHECK-LABEL: _Z7kernel5Pn ; CHECK-O3: ld.global.v2.u64 {[[REG_LO_IN:%rd[0-9]+]], [[REG_HI_IN:%rd[0-9]+]]}, [{{%rd[0-9]+}}]; ; CHECK-O3: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO_IN]], [[REG_HI_IN]]}; @@ -72,20 +71,13 @@ define void @_Z7kernel5Pn(ptr nocapture readonly %data) #2 { %1 = addrspacecast ptr %data to ptr addrspace(1) %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 %3 = bitcast <2 x i64> %2 to i128 - %4 = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %3) #4 + %4 = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %3) %add = add nsw i128 %4, 1 %5 = bitcast i128 %add to <2 x i64> store <2 x i64> %5, ptr addrspace(1) @value, align 16 ret void } -attributes #0 = { alwaysinline convergent mustprogress willreturn "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" } -attributes #1 = { alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" } -attributes #2 = { alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" } -attributes #3 = { convergent nounwind } -attributes #4 = { nounwind } - - !nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1} !0 = !{i32 2, i32 0, i32 3, i32 1} diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll index 337479a06c3f0..94b641e8faf05 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -10,48 +10,40 @@ target triple = "nvptx64-nvidia-cuda" @v_u128_zero = internal addrspace(1) global i128 0, align 16 @v_i128_max = internal addrspace(1) global i128 0, align 16 @v_i128_min = internal addrspace(1) global i128 0, align 16 -@v64 = internal addrspace(1) global i64* null, align 8 -@llvm.used = appending global [10 x i8*] [i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @u128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @u128_zero to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @i128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @i128_min to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* 
@v_u128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_u128_zero to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_i128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_i128_min to i128*) to i8*), i8* bitcast (i64** addrspacecast (i64* addrspace(1)* @v64 to i64**) to i8*), i8* bitcast (void ()* @_Z6kernelv to i8*)], section "llvm.metadata" +@v64 = internal addrspace(1) global ptr null, align 8 -; Function Attrs: alwaysinline -define void @_Z6kernelv() #0 { - ; CHECK-LABLE: _Z6kernelv - ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1; - ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]}; - ; CHECK: mov.u64 [[I128_MAX_HI:%rd[0-9]+]], 9223372036854775807; - ; CHECK: mov.b128 [[I128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[I128_MAX_HI]]}; - ; CHECK: mov.u64 [[I128_MIN_HI:%rd[0-9]+]], -9223372036854775808; - ; CHECK: mov.u64 [[ZERO:%rd[0-9]+]], 0; - ; CHECK: mov.b128 [[I128_MIN:%rq[0-9]+]], {[[ZERO]], [[I128_MIN_HI]]}; - ; CHECK: mov.b128 [[U128_ZERO:%rq[0-9]+]], {[[ZERO]], [[ZERO]]}; - - %tmp = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 - %add.ptr = getelementptr inbounds i64, i64* %tmp, i32 0 - %tmp1 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 - %add.ptr2 = getelementptr inbounds i64, i64* %tmp1, i32 1 - call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -1, i64* %add.ptr, i64* %add.ptr2, i128* addrspacecast (i128 addrspace(1)* @v_u128_max to i128*)) #1 - %tmp3 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 - %add.ptr4 = getelementptr inbounds i64, i64* %tmp3, i32 2 - %tmp5 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 - %add.ptr6 = getelementptr inbounds i64, i64* %tmp5, i32 3 - call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 170141183460469231731687303715884105727, i64* %add.ptr4, i64* %add.ptr6, i128* addrspacecast (i128 addrspace(1)* @v_i128_max to i128*)) #1 - %tmp7 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 - %add.ptr8 = getelementptr inbounds i64, i64* %tmp7, i32 4 - %tmp9 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 - %add.ptr10 = getelementptr inbounds i64, i64* %tmp9, i32 5 - call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -170141183460469231731687303715884105728, i64* %add.ptr8, i64* %add.ptr10, i128* addrspacecast (i128 addrspace(1)* @v_i128_min to i128*)) #1 - %tmp11 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 - %add.ptr12 = getelementptr inbounds i64, i64* %tmp11, i32 6 - %tmp13 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8 - %add.ptr14 = getelementptr inbounds i64, i64* %tmp13, i32 7 - call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 0, i64* %add.ptr12, i64* %add.ptr14, i128* addrspacecast (i128 addrspace(1)* @v_u128_zero to i128*)) #1 +; Function Attrs: alwaysinline convergent 
mustprogress willreturn +define void @_Z6kernelv() { + ; CHECK-LABEL: _Z6kernelv + ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1; + ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]}; + ; CHECK: mov.u64 [[I64_MAX:%rd[0-9]+]], 9223372036854775807; + ; CHECK: mov.b128 [[I128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[I64_MAX]]} + ; CHECK: mov.u64 [[I64_MIN:%rd[0-9]+]], -9223372036854775808; + ; CHECK: mov.u64 [[U64_ZERO:%rd[0-9]+]], 0; + ; CHECK: mov.b128 [[I128_MIN:%rq[0-9]+]], {[[U64_ZERO]], [[I64_MIN]]} + ; CHECK: mov.b128 [[U128_ZERO:%rq[0-9]+]], {[[U64_ZERO]], [[U64_ZERO]]} + + %tmp = load ptr, ptr addrspace(1) @v64, align 8 + %add.ptr2 = getelementptr inbounds i64, ptr %tmp, i64 1 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -1, ptr %tmp, ptr nonnull %add.ptr2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr)) + %tmp3 = load ptr, ptr addrspace(1) @v64, align 8 + %add.ptr4 = getelementptr inbounds i64, ptr %tmp3, i64 2 + %add.ptr6 = getelementptr inbounds i64, ptr %tmp3, i64 3 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %add.ptr4, ptr nonnull %add.ptr6, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr)) + %tmp7 = load ptr, ptr addrspace(1) @v64, align 8 + %add.ptr8 = getelementptr inbounds i64, ptr %tmp7, i64 4 + %add.ptr10 = getelementptr inbounds i64, ptr %tmp7, i64 5 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %add.ptr8, ptr nonnull %add.ptr10, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr)) + %tmp11 = load ptr, ptr addrspace(1) @v64, align 8 + %add.ptr12 = getelementptr inbounds i64, ptr %tmp11, i64 6 + %add.ptr14 = getelementptr inbounds i64, ptr %tmp11, i64 7 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 0, ptr nonnull %add.ptr12, ptr nonnull %add.ptr14, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr)) ret void } -attributes #0 = { alwaysinline "nvvm.annotations_transplanted" "nvvm.kernel" } -attributes #1 = { nounwind } -!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1} +!nvvmir.version = !{!2, !3, !2, !3, !3, !2, !2, !2, !3} -!0 = !{i32 2, i32 0, i32 3, i32 1} -!1 = !{i32 2, i32 0} +!2 = !{i32 2, i32 0, i32 3, i32 1} +!3 = !{i32 2, i32 0} diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index 4f077ec5383c9..9d7a25ca4d467 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -10,7 +10,7 @@ target triple = "nvptx64-nvidia-cuda" @llvm.used = appending global [6 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @size to ptr), ptr addrspacecast (ptr addrspace(1) @value to ptr), ptr addrspacecast (ptr addrspace(1) @x to ptr), ptr addrspacecast (ptr addrspace(1) @y to ptr), ptr addrspacecast (ptr addrspace(1) @z to ptr)], section "llvm.metadata" ; Function Attrs: 
alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none) -define void @_Z6kernelv() #0 { +define void @_Z6kernelv() { ; CHECK-LABEL: _Z6kernelv ; CHECK: mov.b128 [[X:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: mov.b128 [[Y:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; @@ -62,43 +62,43 @@ define void @_Z6kernelv() #0 { %5 = phi i128 [ %x.promoted5, %.lr.ph.preheader.new ], [ %asmresult19.3, %.lr.ph ] %i.04 = phi i64 [ 0, %.lr.ph.preheader.new ], [ %inc.3, %.lr.ph ] %niter = phi i64 [ 0, %.lr.ph.preheader.new ], [ %niter.next.3, %.lr.ph ] - %6 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %5, i128 %4, i128 %3) #1 + %6 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %5, i128 %4, i128 %3) %asmresult = extractvalue { i128, i128, i128 } %6, 0 %asmresult7 = extractvalue { i128, i128, i128 } %6, 1 %asmresult8 = extractvalue { i128, i128, i128 } %6, 2 %add = add nsw i128 %asmresult, %asmresult7 %add12 = add nsw i128 %add, %asmresult8 %add14 = add nsw i128 %add12, %2 - %7 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %asmresult, i128 %asmresult7, i128 %asmresult8) #1 + %7 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %asmresult, i128 %asmresult7, i128 %asmresult8) %asmresult19 = extractvalue { i128, i128, i128 } %7, 0 %asmresult20 = extractvalue { i128, i128, i128 } %7, 1 %asmresult21 = extractvalue { i128, i128, i128 } %7, 2 %inc = add nuw nsw i64 %i.04, 1 - %8 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult19, i128 %asmresult20, i128 %asmresult21) #1 + %8 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, 
hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult19, i128 %asmresult20, i128 %asmresult21) %asmresult.1 = extractvalue { i128, i128, i128 } %8, 0 %asmresult7.1 = extractvalue { i128, i128, i128 } %8, 1 %asmresult8.1 = extractvalue { i128, i128, i128 } %8, 2 %add.1 = add nsw i128 %asmresult.1, %asmresult7.1 %add12.1 = add nsw i128 %add.1, %asmresult8.1 %add14.1 = add nsw i128 %add12.1, %add14 - %9 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult.1, i128 %asmresult7.1, i128 %asmresult8.1) #1 + %9 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult.1, i128 %asmresult7.1, i128 %asmresult8.1) %asmresult19.1 = extractvalue { i128, i128, i128 } %9, 0 %asmresult20.1 = extractvalue { i128, i128, i128 } %9, 1 %asmresult21.1 = extractvalue { i128, i128, i128 } %9, 2 %inc.1 = add nuw nsw i64 %i.04, 2 - %10 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult19.1, i128 %asmresult20.1, i128 %asmresult21.1) #1 + %10 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult19.1, i128 %asmresult20.1, i128 %asmresult21.1) %asmresult.2 = extractvalue { i128, i128, i128 } %10, 0 %asmresult7.2 = extractvalue { i128, i128, i128 } %10, 1 %asmresult8.2 = extractvalue { i128, i128, i128 } %10, 2 %add.2 = add nsw i128 %asmresult.2, %asmresult7.2 %add12.2 = add nsw i128 %add.2, %asmresult8.2 %add14.2 = add nsw i128 %add12.2, %add14.1 - %11 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult.2, i128 %asmresult7.2, i128 %asmresult8.2) #1 + %11 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 
{lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult.2, i128 %asmresult7.2, i128 %asmresult8.2) %asmresult19.2 = extractvalue { i128, i128, i128 } %11, 0 %asmresult20.2 = extractvalue { i128, i128, i128 } %11, 1 %asmresult21.2 = extractvalue { i128, i128, i128 } %11, 2 %inc.2 = add nuw nsw i64 %i.04, 3 - %12 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult19.2, i128 %asmresult20.2, i128 %asmresult21.2) #1 + %12 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult19.2, i128 %asmresult20.2, i128 %asmresult21.2) %asmresult.3 = extractvalue { i128, i128, i128 } %12, 0 %asmresult7.3 = extractvalue { i128, i128, i128 } %12, 1 %asmresult8.3 = extractvalue { i128, i128, i128 } %12, 2 @@ -107,7 +107,7 @@ define void @_Z6kernelv() #0 { %add14.3 = add nsw i128 %add12.3, %add14.2 %13 = bitcast i128 %add14.3 to <2 x i64> store <2 x i64> %13, ptr addrspace(1) @value, align 16 - %14 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult.3, i128 %asmresult7.3, i128 %asmresult8.3) #1 + %14 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult.3, i128 %asmresult7.3, i128 %asmresult8.3) %asmresult19.3 = extractvalue { i128, i128, i128 } %14, 0 %asmresult20.3 = extractvalue { i128, i128, i128 } %14, 1 %asmresult21.3 = extractvalue { i128, i128, i128 } %14, 2 @@ -138,7 +138,7 @@ define void @_Z6kernelv() #0 { %21 = phi i128 [ %asmresult19.epil, %.lr.ph.epil ], [ %.unr11, %._crit_edge.loopexit.unr-lcssa ] %i.04.epil = phi i64 [ %inc.epil, %.lr.ph.epil ], [ %i.04.unr, %._crit_edge.loopexit.unr-lcssa ] %epil.iter = phi i64 [ %epil.iter.next, %.lr.ph.epil ], [ 0, %._crit_edge.loopexit.unr-lcssa ] - %22 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 
3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %21, i128 %20, i128 %19) #1 + %22 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %21, i128 %20, i128 %19) %asmresult.epil = extractvalue { i128, i128, i128 } %22, 0 %asmresult7.epil = extractvalue { i128, i128, i128 } %22, 1 %asmresult8.epil = extractvalue { i128, i128, i128 } %22, 2 @@ -147,7 +147,7 @@ define void @_Z6kernelv() #0 { %add14.epil = add nsw i128 %add12.epil, %18 %23 = bitcast i128 %add14.epil to <2 x i64> store <2 x i64> %23, ptr addrspace(1) @value, align 16 - %24 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %asmresult.epil, i128 %asmresult7.epil, i128 %asmresult8.epil) #1 + %24 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %asmresult.epil, i128 %asmresult7.epil, i128 %asmresult8.epil) %asmresult19.epil = extractvalue { i128, i128, i128 } %24, 0 %asmresult20.epil = extractvalue { i128, i128, i128 } %24, 1 %asmresult21.epil = extractvalue { i128, i128, i128 } %24, 2 @@ -166,8 +166,6 @@ define void @_Z6kernelv() #0 { ret void } -attributes #0 = { alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" } -attributes #1 = { nounwind } !nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1} From 9d28385e1b445302d84a9313232adcf544010191 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Wed, 26 Jun 2024 00:04:10 +0000 Subject: [PATCH 3/6] Update kernel names in tests & Update one test for inline asm in loops --- .../CodeGen/NVPTX/inline-asm-b128-test1.ll | 36 ++-- .../CodeGen/NVPTX/inline-asm-b128-test2.ll | 7 +- .../CodeGen/NVPTX/inline-asm-b128-test3.ll | 177 +++--------------- 3 files changed, 45 insertions(+), 175 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll index 8b5369d2804b0..8a256d50d6050 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -1,12 +1,13 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" @value = internal addrspace(1) global i128 0, align 16 ; Function Attrs: alwaysinline convergent 
mustprogress willreturn -define void @_Z7kernel1v() { - ; CHECK-LABEL: _Z7kernel1v +define void @test_b128_input_from_const() { + ; CHECK-LABEL: test_b128_input_from_const ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0; ; CHECK: mov.u64 [[REG_LO:%rd[0-9]+]], 42; ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; @@ -17,8 +18,8 @@ define void @_Z7kernel1v() { } ; Function Attrs: alwaysinline convergent mustprogress willreturn -define void @_Z7kernel2Pn(ptr nocapture readonly %data) { - ; CHECK-LABEL: _Z7kernel2Pn +define void @test_b128_input_from_load(ptr nocapture readonly %data) { + ; CHECK-LABEL: test_b128_input_from_load ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8]; ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]]; ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; @@ -32,8 +33,8 @@ define void @_Z7kernel2Pn(ptr nocapture readonly %data) { } ; Function Attrs: alwaysinline convergent mustprogress willreturn -define void @_Z7kernel3Pb(ptr nocapture readonly %flag) { - ; CHECK-LABEL: _Z7kernel3Pb +define void @test_b128_input_from_select(ptr nocapture readonly %flag) { + ; CHECK-LABEL: test_b128_input_from_select ; CHECK: selp.b64 [[REG_LO:%rd[0-9]+]], 24, 42, {{%p[0-9]+}}; ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0; ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; @@ -48,10 +49,10 @@ define void @_Z7kernel3Pb(ptr nocapture readonly %flag) { } ; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none) -define void @_Z7kernel4v() { - ; CHECK-LABEL: _Z7kernel4v - ; CHECK-O3: { mov.b128 [[REG_128:%rq[0-9]+]], 41; } - ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]]; +define void @test_store_b128_output() { + ; CHECK-LABEL: test_store_b128_output + ; CHECK: { mov.b128 [[REG_128:%rq[0-9]+]], 41; } + ; CHECK: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]]; %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"() %add = add nsw i128 %1, 1 @@ -61,12 +62,13 @@ define void @_Z7kernel4v() { } ; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none) -define void @_Z7kernel5Pn(ptr nocapture readonly %data) { - ; CHECK-LABEL: _Z7kernel5Pn - ; CHECK-O3: ld.global.v2.u64 {[[REG_LO_IN:%rd[0-9]+]], [[REG_HI_IN:%rd[0-9]+]]}, [{{%rd[0-9]+}}]; - ; CHECK-O3: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO_IN]], [[REG_HI_IN]]}; - ; CHECK-O3: { mov.b128 [[REG_128_OUT:%rq[0-9]+]], [[REG_128_IN]]; } - ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128_OUT]]; +define void @test_use_of_b128_output(ptr nocapture readonly %data) { + ; CHECK-LABEL: test_use_of_b128_output + ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8]; + ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]]; + ; CHECK: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; + ; CHECK: { mov.b128 [[REG_128_OUT:%rq[0-9]+]], [[REG_128_IN]]; } + ; CHECK: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128_OUT]]; %1 = addrspacecast ptr %data to ptr addrspace(1) %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll index 94b641e8faf05..09b648d036a4c 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s 
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" @@ -13,8 +14,8 @@ target triple = "nvptx64-nvidia-cuda" @v64 = internal addrspace(1) global ptr null, align 8 ; Function Attrs: alwaysinline convergent mustprogress willreturn -define void @_Z6kernelv() { - ; CHECK-LABEL: _Z6kernelv +define void @test_corner_values() { + ; CHECK-LABEL: test_corner_values ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1; ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]}; ; CHECK: mov.u64 [[I64_MAX:%rd[0-9]+]], 9223372036854775807; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index 9d7a25ca4d467..e187aa4370858 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -1,177 +1,44 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} target triple = "nvptx64-nvidia-cuda" @size = internal addrspace(1) global i32 0, align 4 -@value = internal addrspace(1) global i128 0, align 16 @x = internal addrspace(1) global i128 0, align 16 -@y = internal addrspace(1) global i128 0, align 16 -@z = internal addrspace(1) global i128 0, align 16 -@llvm.used = appending global [6 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @size to ptr), ptr addrspacecast (ptr addrspace(1) @value to ptr), ptr addrspacecast (ptr addrspace(1) @x to ptr), ptr addrspacecast (ptr addrspace(1) @y to ptr), ptr addrspacecast (ptr addrspace(1) @z to ptr)], section "llvm.metadata" -; Function Attrs: alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none) -define void @_Z6kernelv() { - ; CHECK-LABEL: _Z6kernelv - ; CHECK: mov.b128 [[X:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; - ; CHECK: mov.b128 [[Y:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; - ; CHECK: mov.b128 [[Z:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; - ; CHECK: mov.b128 {lo, hi}, [[X]]; - ; CHECK: mov.b128 [[X]], {lo, hi}; - ; CHECK: mov.b128 {lo, hi}, [[Y]]; - ; CHECK: mov.b128 [[Y]], {lo, hi}; - ; CHECK: mov.b128 {lo, hi}, [[Z]]; - ; CHECK: mov.b128 [[Z]], {lo, hi}; - ; CHECK: mov.b128 {[[X_LO:%rd[0-9]+]], [[X_HI:%rd[0-9]+]]}, [[X]]; - ; CHECK: mov.b128 {[[Y_LO:%rd[0-9]+]], [[Y_HI:%rd[0-9]+]]}, [[Y]]; - ; CHECK: mov.b128 {[[Z_LO:%rd[0-9]+]], [[Z_HI:%rd[0-9]+]]}, [[Z]]; - ; CHECK: mov.b128 [[X_NEW:%rq[0-9]+]], {[[X_LO]], [[X_HI]]}; - ; CHECK: mov.b128 [[Y_NEW:%rq[0-9]+]], {[[Y_LO]], [[Y_HI]]}; - ; CHECK: mov.b128 [[Z_NEW:%rq[0-9]+]], {[[Z_LO]], [[Z_HI]]}; - ; CHECK: mov.b128 {lo, hi}, [[X_NEW]]; - ; CHECK: mov.b128 [[X_NEW]], {lo, hi}; - ; CHECK: mov.b128 {lo, hi}, [[Y_NEW]]; - ; CHECK: mov.b128 [[Y_NEW]], {lo, hi}; - ; CHECK: mov.b128 {lo, hi}, [[Z_NEW]]; - ; CHECK: mov.b128 [[Z_NEW]], {lo, hi}; - ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[X_NEW]]; - ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[Y_NEW]]; - ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[Z_NEW]]; - +define void @test_b128_in_loop() { + ; CHECK-LABEL: test_b128_in_loop + ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [x+8]; + ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [x]; + ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; + ; CHECK: mov.b128 {lo, hi}, [[REG_128]]; + ; CHECK: add.cc.u64 lo, lo, {{%rd[0-9]+}}; + ; CHECK: 
mov.b128 [[REG_128]], {lo, hi}; + %tmp11 = load i32, ptr addrspace(1) @size, align 4 %cmp3.not = icmp eq i32 %tmp11, 0 br i1 %cmp3.not, label %._crit_edge, label %.lr.ph.preheader .lr.ph.preheader: ; preds = %0 %x.promoted5 = load i128, ptr addrspace(1) @x, align 16 - %y.promoted6 = load i128, ptr addrspace(1) @y, align 16 - %z.promoted7 = load i128, ptr addrspace(1) @z, align 16 - %value.promoted8 = load i128, ptr addrspace(1) @value, align 16 %umax = sext i32 %tmp11 to i64 - %xtraiter = and i64 %umax, 3 - %1 = icmp ult i32 %tmp11, 4 - br i1 %1, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph.preheader.new - -.lr.ph.preheader.new: ; preds = %.lr.ph.preheader - %unroll_iter = and i64 %umax, -4 br label %.lr.ph -.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader.new - %2 = phi i128 [ %value.promoted8, %.lr.ph.preheader.new ], [ %add14.3, %.lr.ph ] - %3 = phi i128 [ %z.promoted7, %.lr.ph.preheader.new ], [ %asmresult21.3, %.lr.ph ] - %4 = phi i128 [ %y.promoted6, %.lr.ph.preheader.new ], [ %asmresult20.3, %.lr.ph ] - %5 = phi i128 [ %x.promoted5, %.lr.ph.preheader.new ], [ %asmresult19.3, %.lr.ph ] - %i.04 = phi i64 [ 0, %.lr.ph.preheader.new ], [ %inc.3, %.lr.ph ] - %niter = phi i64 [ 0, %.lr.ph.preheader.new ], [ %niter.next.3, %.lr.ph ] - %6 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %5, i128 %4, i128 %3) - %asmresult = extractvalue { i128, i128, i128 } %6, 0 - %asmresult7 = extractvalue { i128, i128, i128 } %6, 1 - %asmresult8 = extractvalue { i128, i128, i128 } %6, 2 - %add = add nsw i128 %asmresult, %asmresult7 - %add12 = add nsw i128 %add, %asmresult8 - %add14 = add nsw i128 %add12, %2 - %7 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %asmresult, i128 %asmresult7, i128 %asmresult8) - %asmresult19 = extractvalue { i128, i128, i128 } %7, 0 - %asmresult20 = extractvalue { i128, i128, i128 } %7, 1 - %asmresult21 = extractvalue { i128, i128, i128 } %7, 2 - %inc = add nuw nsw i64 %i.04, 1 - %8 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult19, i128 %asmresult20, i128 %asmresult21) - %asmresult.1 = extractvalue { i128, i128, i128 } %8, 0 - %asmresult7.1 = extractvalue { i128, i128, i128 } %8, 1 - %asmresult8.1 = extractvalue { i128, i128, i128 } %8, 2 - %add.1 = add nsw i128 %asmresult.1, %asmresult7.1 - %add12.1 = add nsw i128 %add.1, %asmresult8.1 - %add14.1 = add nsw i128 %add12.1, %add14 - %9 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 
1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult.1, i128 %asmresult7.1, i128 %asmresult8.1) - %asmresult19.1 = extractvalue { i128, i128, i128 } %9, 0 - %asmresult20.1 = extractvalue { i128, i128, i128 } %9, 1 - %asmresult21.1 = extractvalue { i128, i128, i128 } %9, 2 - %inc.1 = add nuw nsw i64 %i.04, 2 - %10 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult19.1, i128 %asmresult20.1, i128 %asmresult21.1) - %asmresult.2 = extractvalue { i128, i128, i128 } %10, 0 - %asmresult7.2 = extractvalue { i128, i128, i128 } %10, 1 - %asmresult8.2 = extractvalue { i128, i128, i128 } %10, 2 - %add.2 = add nsw i128 %asmresult.2, %asmresult7.2 - %add12.2 = add nsw i128 %add.2, %asmresult8.2 - %add14.2 = add nsw i128 %add12.2, %add14.1 - %11 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult.2, i128 %asmresult7.2, i128 %asmresult8.2) - %asmresult19.2 = extractvalue { i128, i128, i128 } %11, 0 - %asmresult20.2 = extractvalue { i128, i128, i128 } %11, 1 - %asmresult21.2 = extractvalue { i128, i128, i128 } %11, 2 - %inc.2 = add nuw nsw i64 %i.04, 3 - %12 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult19.2, i128 %asmresult20.2, i128 %asmresult21.2) - %asmresult.3 = extractvalue { i128, i128, i128 } %12, 0 - %asmresult7.3 = extractvalue { i128, i128, i128 } %12, 1 - %asmresult8.3 = extractvalue { i128, i128, i128 } %12, 2 - %add.3 = add nsw i128 %asmresult.3, %asmresult7.3 - %add12.3 = add nsw i128 %add.3, %asmresult8.3 - %add14.3 = add nsw i128 %add12.3, %add14.2 - %13 = bitcast i128 %add14.3 to <2 x i64> - store <2 x i64> %13, ptr addrspace(1) @value, align 16 - %14 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult.3, i128 %asmresult7.3, i128 %asmresult8.3) - %asmresult19.3 = extractvalue { i128, i128, i128 } %14, 0 - %asmresult20.3 = extractvalue { i128, i128, i128 } %14, 1 - %asmresult21.3 = extractvalue { i128, i128, i128 } %14, 2 - %15 = 
bitcast i128 %asmresult19.3 to <2 x i64> - store <2 x i64> %15, ptr addrspace(1) @x, align 16 - %16 = bitcast i128 %asmresult20.3 to <2 x i64> - store <2 x i64> %16, ptr addrspace(1) @y, align 16 - %17 = bitcast i128 %asmresult21.3 to <2 x i64> - store <2 x i64> %17, ptr addrspace(1) @z, align 16 - %inc.3 = add nuw i64 %i.04, 4 - %niter.next.3 = add i64 %niter, 4 - %niter.ncmp.3.not = icmp eq i64 %niter.next.3, %unroll_iter - br i1 %niter.ncmp.3.not, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph, !llvm.loop !2 - -._crit_edge.loopexit.unr-lcssa: ; preds = %.lr.ph, %.lr.ph.preheader - %.unr = phi i128 [ %value.promoted8, %.lr.ph.preheader ], [ %add14.3, %.lr.ph ] - %.unr9 = phi i128 [ %z.promoted7, %.lr.ph.preheader ], [ %asmresult21.3, %.lr.ph ] - %.unr10 = phi i128 [ %y.promoted6, %.lr.ph.preheader ], [ %asmresult20.3, %.lr.ph ] - %.unr11 = phi i128 [ %x.promoted5, %.lr.ph.preheader ], [ %asmresult19.3, %.lr.ph ] - %i.04.unr = phi i64 [ 0, %.lr.ph.preheader ], [ %inc.3, %.lr.ph ] - %lcmp.mod.not = icmp eq i64 %xtraiter, 0 - br i1 %lcmp.mod.not, label %._crit_edge, label %.lr.ph.epil - -.lr.ph.epil: ; preds = %.lr.ph.epil, %._crit_edge.loopexit.unr-lcssa - %18 = phi i128 [ %add14.epil, %.lr.ph.epil ], [ %.unr, %._crit_edge.loopexit.unr-lcssa ] - %19 = phi i128 [ %asmresult21.epil, %.lr.ph.epil ], [ %.unr9, %._crit_edge.loopexit.unr-lcssa ] - %20 = phi i128 [ %asmresult20.epil, %.lr.ph.epil ], [ %.unr10, %._crit_edge.loopexit.unr-lcssa ] - %21 = phi i128 [ %asmresult19.epil, %.lr.ph.epil ], [ %.unr11, %._crit_edge.loopexit.unr-lcssa ] - %i.04.epil = phi i64 [ %inc.epil, %.lr.ph.epil ], [ %i.04.unr, %._crit_edge.loopexit.unr-lcssa ] - %epil.iter = phi i64 [ %epil.iter.next, %.lr.ph.epil ], [ 0, %._crit_edge.loopexit.unr-lcssa ] - %22 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %21, i128 %20, i128 %19) - %asmresult.epil = extractvalue { i128, i128, i128 } %22, 0 - %asmresult7.epil = extractvalue { i128, i128, i128 } %22, 1 - %asmresult8.epil = extractvalue { i128, i128, i128 } %22, 2 - %add.epil = add nsw i128 %asmresult.epil, %asmresult7.epil - %add12.epil = add nsw i128 %add.epil, %asmresult8.epil - %add14.epil = add nsw i128 %add12.epil, %18 - %23 = bitcast i128 %add14.epil to <2 x i64> - store <2 x i64> %23, ptr addrspace(1) @value, align 16 - %24 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %asmresult.epil, i128 %asmresult7.epil, i128 %asmresult8.epil) - %asmresult19.epil = extractvalue { i128, i128, i128 } %24, 0 - %asmresult20.epil = extractvalue { i128, i128, i128 } %24, 1 - %asmresult21.epil = extractvalue { i128, i128, i128 } %24, 2 - %25 = bitcast i128 %asmresult19.epil to <2 x i64> - store <2 x i64> %25, ptr addrspace(1) @x, align 16 - %26 = bitcast i128 %asmresult20.epil to <2 x i64> - store <2 x i64> %26, ptr addrspace(1) @y, align 16 - %27 = bitcast i128 
%asmresult21.epil to <2 x i64> - store <2 x i64> %27, ptr addrspace(1) @z, align 16 - %inc.epil = add nuw i64 %i.04.epil, 1 - %epil.iter.next = add i64 %epil.iter, 1 - %epil.iter.cmp.not = icmp eq i64 %epil.iter.next, %xtraiter - br i1 %epil.iter.cmp.not, label %._crit_edge, label %.lr.ph.epil, !llvm.loop !4 - -._crit_edge: ; preds = %.lr.ph.epil, %._crit_edge.loopexit.unr-lcssa, %0 +.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader + %1 = phi i128 [ %2, %.lr.ph ], [ %x.promoted5, %.lr.ph.preheader ] + %i.04 = phi i64 [ %inc, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %2 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09} \0A\09", "=q,l,0"(i64 %i.04, i128 %1) + %3 = bitcast i128 %2 to <2 x i64> + store <2 x i64> %3, ptr addrspace(1) @x, align 16 + %inc = add nuw i64 %i.04, 1 + %exitcond.not = icmp eq i64 %inc, %umax + br i1 %exitcond.not, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 ret void } - !nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1} !0 = !{i32 2, i32 0, i32 3, i32 1} !1 = !{i32 2, i32 0} -!2 = distinct !{!2, !3} -!3 = !{!"llvm.loop.mustprogress"} -!4 = distinct !{!4, !5} -!5 = !{!"llvm.loop.unroll.disable"} From c4bc027557dcfd025522739f67b22d28de14dedb Mon Sep 17 00:00:00 2001 From: chengjunp Date: Wed, 26 Jun 2024 20:47:52 +0000 Subject: [PATCH 4/6] Update testcases with checks generated by update_llc_test_checks.py --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 17 +-- .../CodeGen/NVPTX/inline-asm-b128-test1.ll | 127 +++++++++++++----- .../CodeGen/NVPTX/inline-asm-b128-test2.ll | 106 +++++++++++++-- .../CodeGen/NVPTX/inline-asm-b128-test3.ll | 44 ++++-- 5 files changed, 232 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 9c0498560db21..11193c11ede3b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -3830,14 +3830,13 @@ void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) { SDNode *Mov = CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi}); - SmallVector ResultsType(N->value_begin(), N->value_end()); - SmallVector NewOps(N->getNumOperands() - 1); + SmallVector NewOps(N->getNumOperands() - 1); NewOps[0] = N->getOperand(0); NewOps[1] = Dst; NewOps[2] = SDValue(Mov, 0); if (N->getNumOperands() == 5) NewOps[3] = N->getOperand(4); - SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, ResultsType, NewOps); + SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector(N->values()), NewOps); ReplaceNode(N, NewValue.getNode()); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 5449e35149a1e..0bfcba602cb51 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3117,8 +3117,8 @@ SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op, SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, DAG.getIntPtrConstant(1, DL)); - SmallVector NewOps(Op->getNumOperands() + 1); - SmallVector ResultsType(Node->value_begin(), Node->value_end()); + SmallVector NewOps(Op->getNumOperands() + 1); + SmallVector ResultsType(Node->value_begin(), Node->value_end()); NewOps[0] = Op->getOperand(0); // Chain NewOps[1] = Op->getOperand(1); // Dst Reg @@ -6333,16 +6333,9 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG 
&DAG, assert(Reg.getValueType() == MVT::i128 && "Custom lowering for CopyFromReg with 128-bit reg only"); - SmallVector ResultsType(4); - SmallVector NewOps(3); - ResultsType[0] = MVT::i64; - ResultsType[1] = MVT::i64; - ResultsType[2] = N->getValueType(1); - ResultsType[3] = N->getValueType(2); - - NewOps[0] = Chain; - NewOps[1] = Reg; - NewOps[2] = Glue; + SmallVector ResultsType = {MVT::i64, MVT::i64, N->getValueType(1), + N->getValueType(2)}; + SmallVector NewOps = {Chain, Reg, Glue}; SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps); SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll index 8a256d50d6050..a04ed40dbf91a 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 ; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} @@ -5,25 +6,49 @@ target triple = "nvptx64-nvidia-cuda" @value = internal addrspace(1) global i128 0, align 16 -; Function Attrs: alwaysinline convergent mustprogress willreturn define void @test_b128_input_from_const() { - ; CHECK-LABEL: test_b128_input_from_const - ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0; - ; CHECK: mov.u64 [[REG_LO:%rd[0-9]+]], 42; - ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; - ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; } +; CHECK-LABEL: test_b128_input_from_const( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b128 %rq<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u64 %rd2, 0; +; CHECK-NEXT: mov.u64 %rd3, 42; +; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2}; +; CHECK-NEXT: mov.u32 %r1, value; +; CHECK-NEXT: cvta.global.u32 %r2, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r2; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { st.b128 [%rd1], %rq1; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ret; tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42) ret void } -; Function Attrs: alwaysinline convergent mustprogress willreturn define void @test_b128_input_from_load(ptr nocapture readonly %data) { - ; CHECK-LABEL: test_b128_input_from_load - ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8]; - ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]]; - ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; - ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; } +; CHECK-LABEL: test_b128_input_from_load( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b128 %rq<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_b128_input_from_load_param_0]; +; CHECK-NEXT: cvta.to.global.u32 %r2, %r1; +; CHECK-NEXT: ld.global.u64 %rd2, [%r2+8]; +; CHECK-NEXT: ld.global.u64 %rd3, [%r2]; +; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2}; +; CHECK-NEXT: mov.u32 %r3, value; +; CHECK-NEXT: cvta.global.u32 %r4, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r4; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { st.b128 [%rd1], %rq1; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ret; %1 = addrspacecast ptr %data to ptr addrspace(1) %2 = load <2 x i64>, ptr 
addrspace(1) %1, align 16 @@ -32,13 +57,30 @@ define void @test_b128_input_from_load(ptr nocapture readonly %data) { ret void } -; Function Attrs: alwaysinline convergent mustprogress willreturn define void @test_b128_input_from_select(ptr nocapture readonly %flag) { - ; CHECK-LABEL: test_b128_input_from_select - ; CHECK: selp.b64 [[REG_LO:%rd[0-9]+]], 24, 42, {{%p[0-9]+}}; - ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0; - ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; - ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; } +; CHECK-LABEL: test_b128_input_from_select( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b128 %rq<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_b128_input_from_select_param_0]; +; CHECK-NEXT: cvta.to.global.u32 %r2, %r1; +; CHECK-NEXT: ld.global.u8 %rs1, [%r2]; +; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0; +; CHECK-NEXT: selp.b64 %rd2, 24, 42, %p1; +; CHECK-NEXT: mov.u64 %rd3, 0; +; CHECK-NEXT: mov.b128 %rq1, {%rd2, %rd3}; +; CHECK-NEXT: mov.u32 %r3, value; +; CHECK-NEXT: cvta.global.u32 %r4, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r4; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { st.b128 [%rd1], %rq1; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ret; %1 = addrspacecast ptr %flag to ptr addrspace(1) %tmp1 = load i8, ptr addrspace(1) %1, align 1 @@ -48,12 +90,23 @@ define void @test_b128_input_from_select(ptr nocapture readonly %flag) { ret void } -; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none) define void @test_store_b128_output() { - ; CHECK-LABEL: test_store_b128_output - ; CHECK: { mov.b128 [[REG_128:%rq[0-9]+]], 41; } - ; CHECK: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]]; - +; CHECK-LABEL: test_store_b128_output( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b128 %rq<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { mov.b128 %rq1, 41; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: mov.b128 {%rd1, %rd2}, %rq1; +; CHECK-NEXT: add.cc.s64 %rd3, %rd1, 1; +; CHECK-NEXT: addc.cc.s64 %rd4, %rd2, 0; +; CHECK-NEXT: st.global.u64 [value+8], %rd4; +; CHECK-NEXT: st.global.u64 [value], %rd3; +; CHECK-NEXT: ret; + %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"() %add = add nsw i128 %1, 1 %2 = bitcast i128 %add to <2 x i64> @@ -61,14 +114,28 @@ define void @test_store_b128_output() { ret void } -; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none) define void @test_use_of_b128_output(ptr nocapture readonly %data) { - ; CHECK-LABEL: test_use_of_b128_output - ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8]; - ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]]; - ; CHECK: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; - ; CHECK: { mov.b128 [[REG_128_OUT:%rq[0-9]+]], [[REG_128_IN]]; } - ; CHECK: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128_OUT]]; +; CHECK-LABEL: test_use_of_b128_output( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-NEXT: .reg .b128 %rq<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_use_of_b128_output_param_0]; +; CHECK-NEXT: cvta.to.global.u32 %r2, %r1; +; CHECK-NEXT: ld.global.u64 %rd1, [%r2+8]; +; CHECK-NEXT: ld.global.u64 %rd2, [%r2]; +; CHECK-NEXT: mov.b128 %rq2, {%rd2, 
%rd1}; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { mov.b128 %rq1, %rq2; } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: mov.b128 {%rd3, %rd4}, %rq1; +; CHECK-NEXT: add.cc.s64 %rd5, %rd3, 1; +; CHECK-NEXT: addc.cc.s64 %rd6, %rd4, 0; +; CHECK-NEXT: st.global.u64 [value], %rd5; +; CHECK-NEXT: st.global.u64 [value+8], %rd6; +; CHECK-NEXT: ret; %1 = addrspacecast ptr %data to ptr addrspace(1) %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll index 09b648d036a4c..bb45ff6ba2e27 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 ; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} @@ -13,33 +14,110 @@ target triple = "nvptx64-nvidia-cuda" @v_i128_min = internal addrspace(1) global i128 0, align 16 @v64 = internal addrspace(1) global ptr null, align 8 -; Function Attrs: alwaysinline convergent mustprogress willreturn define void @test_corner_values() { - ; CHECK-LABEL: test_corner_values - ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1; - ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]}; - ; CHECK: mov.u64 [[I64_MAX:%rd[0-9]+]], 9223372036854775807; - ; CHECK: mov.b128 [[I128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[I64_MAX]]} - ; CHECK: mov.u64 [[I64_MIN:%rd[0-9]+]], -9223372036854775808; - ; CHECK: mov.u64 [[U64_ZERO:%rd[0-9]+]], 0; - ; CHECK: mov.b128 [[I128_MIN:%rq[0-9]+]], {[[U64_ZERO]], [[I64_MIN]]} - ; CHECK: mov.b128 [[U128_ZERO:%rq[0-9]+]], {[[U64_ZERO]], [[U64_ZERO]]} +; CHECK-LABEL: test_corner_values( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b64 %rd<17>; +; CHECK-NEXT: .reg .b128 %rq<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.u32 %r1, [v64]; +; CHECK-NEXT: add.s32 %r2, %r1, 8; +; CHECK-NEXT: mov.u64 %rd13, -1; +; CHECK-NEXT: mov.b128 %rq1, {%rd13, %rd13}; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd2, %r2; +; CHECK-NEXT: mov.u32 %r3, v_u128_max; +; CHECK-NEXT: cvta.global.u32 %r4, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd3, %r4; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq1; +; CHECK-NEXT: st.b64 [%rd1], lo; +; CHECK-NEXT: st.b64 [%rd2], hi; +; CHECK-NEXT: st.b128 [%rd3], %rq1; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ld.global.u32 %r5, [v64]; +; CHECK-NEXT: add.s32 %r6, %r5, 16; +; CHECK-NEXT: add.s32 %r7, %r5, 24; +; CHECK-NEXT: mov.u64 %rd14, 9223372036854775807; +; CHECK-NEXT: mov.b128 %rq2, {%rd13, %rd14}; +; CHECK-NEXT: mov.u32 %r8, v_i128_max; +; CHECK-NEXT: cvta.global.u32 %r9, %r8; +; CHECK-NEXT: cvt.u64.u32 %rd6, %r9; +; CHECK-NEXT: cvt.u64.u32 %rd4, %r6; +; CHECK-NEXT: cvt.u64.u32 %rd5, %r7; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq2; +; CHECK-NEXT: st.b64 [%rd4], lo; +; CHECK-NEXT: st.b64 [%rd5], hi; +; CHECK-NEXT: st.b128 [%rd6], %rq2; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ld.global.u32 %r10, [v64]; +; CHECK-NEXT: add.s32 %r11, %r10, 32; +; CHECK-NEXT: add.s32 %r12, %r10, 40; +; CHECK-NEXT: mov.u64 %rd15, 
-9223372036854775808; +; CHECK-NEXT: mov.u64 %rd16, 0; +; CHECK-NEXT: mov.b128 %rq3, {%rd16, %rd15}; +; CHECK-NEXT: mov.u32 %r13, v_i128_min; +; CHECK-NEXT: cvta.global.u32 %r14, %r13; +; CHECK-NEXT: cvt.u64.u32 %rd9, %r14; +; CHECK-NEXT: cvt.u64.u32 %rd7, %r11; +; CHECK-NEXT: cvt.u64.u32 %rd8, %r12; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq3; +; CHECK-NEXT: st.b64 [%rd7], lo; +; CHECK-NEXT: st.b64 [%rd8], hi; +; CHECK-NEXT: st.b128 [%rd9], %rq3; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ld.global.u32 %r15, [v64]; +; CHECK-NEXT: add.s32 %r16, %r15, 48; +; CHECK-NEXT: add.s32 %r17, %r15, 56; +; CHECK-NEXT: mov.b128 %rq4, {%rd16, %rd16}; +; CHECK-NEXT: mov.u32 %r18, v_u128_zero; +; CHECK-NEXT: cvta.global.u32 %r19, %r18; +; CHECK-NEXT: cvt.u64.u32 %rd12, %r19; +; CHECK-NEXT: cvt.u64.u32 %rd10, %r16; +; CHECK-NEXT: cvt.u64.u32 %rd11, %r17; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq4; +; CHECK-NEXT: st.b64 [%rd10], lo; +; CHECK-NEXT: st.b64 [%rd11], hi; +; CHECK-NEXT: st.b128 [%rd12], %rq4; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: ret; %tmp = load ptr, ptr addrspace(1) @v64, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %tmp, i64 1 - tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -1, ptr %tmp, ptr nonnull %add.ptr2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr)) + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -1, ptr %tmp, ptr nonnull %add.ptr2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr)) %tmp3 = load ptr, ptr addrspace(1) @v64, align 8 %add.ptr4 = getelementptr inbounds i64, ptr %tmp3, i64 2 %add.ptr6 = getelementptr inbounds i64, ptr %tmp3, i64 3 - tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %add.ptr4, ptr nonnull %add.ptr6, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr)) + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %add.ptr4, ptr nonnull %add.ptr6, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr)) %tmp7 = load ptr, ptr addrspace(1) @v64, align 8 %add.ptr8 = getelementptr inbounds i64, ptr %tmp7, i64 4 %add.ptr10 = getelementptr inbounds i64, ptr %tmp7, i64 5 - tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %add.ptr8, ptr nonnull %add.ptr10, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr)) + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", 
"q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %add.ptr8, ptr nonnull %add.ptr10, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr)) %tmp11 = load ptr, ptr addrspace(1) @v64, align 8 %add.ptr12 = getelementptr inbounds i64, ptr %tmp11, i64 6 %add.ptr14 = getelementptr inbounds i64, ptr %tmp11, i64 7 - tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 0, ptr nonnull %add.ptr12, ptr nonnull %add.ptr14, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr)) + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 0, ptr nonnull %add.ptr12, ptr nonnull %add.ptr14, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr)) ret void } diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index e187aa4370858..081956447345c 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 ; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %} @@ -7,13 +8,40 @@ target triple = "nvptx64-nvidia-cuda" @x = internal addrspace(1) global i128 0, align 16 define void @test_b128_in_loop() { - ; CHECK-LABEL: test_b128_in_loop - ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [x+8]; - ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [x]; - ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]}; - ; CHECK: mov.b128 {lo, hi}, [[REG_128]]; - ; CHECK: add.cc.u64 lo, lo, {{%rd[0-9]+}}; - ; CHECK: mov.b128 [[REG_128]], {lo, hi}; +; CHECK-LABEL: test_b128_in_loop( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b64 %rd<15>; +; CHECK-NEXT: .reg .b128 %rq<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.s32 %rd1, [size]; +; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0; +; CHECK-NEXT: @%p1 bra $L__BB0_3; +; CHECK-NEXT: // %bb.1: // %.lr.ph.preheader +; CHECK-NEXT: ld.global.u64 %rd13, [x+8]; +; CHECK-NEXT: ld.global.u64 %rd12, [x]; +; CHECK-NEXT: mov.u64 %rd14, 0; +; CHECK-NEXT: $L__BB0_2: // %.lr.ph +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: mov.b128 %rq1, {%rd12, %rd13}; +; CHECK-NEXT: // begin inline asm +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b64 lo; +; CHECK-NEXT: .reg .b64 hi; +; CHECK-NEXT: mov.b128 {lo, hi}, %rq1; +; CHECK-NEXT: add.cc.u64 lo, lo, %rd14; +; CHECK-NEXT: mov.b128 %rq1, {lo, hi}; +; CHECK-NEXT: } +; CHECK-NEXT: // end inline asm +; CHECK-NEXT: mov.b128 {%rd12, %rd13}, %rq1; +; CHECK-NEXT: st.global.u64 [x+8], %rd13; +; CHECK-NEXT: st.global.u64 [x], %rd12; +; CHECK-NEXT: add.s64 %rd14, %rd14, 1; +; CHECK-NEXT: setp.ne.s64 %p2, %rd1, %rd14; +; CHECK-NEXT: @%p2 bra $L__BB0_2; +; CHECK-NEXT: $L__BB0_3: // %._crit_edge +; CHECK-NEXT: ret; %tmp11 = load i32, ptr addrspace(1) @size, align 4 %cmp3.not = icmp eq i32 %tmp11, 0 @@ -27,7 +55,7 @@ define void @test_b128_in_loop() { .lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader %1 = phi i128 [ %2, %.lr.ph ], [ %x.promoted5, %.lr.ph.preheader ] %i.04 = phi i64 [ %inc, %.lr.ph ], [ 0, %.lr.ph.preheader ] - %2 = tail call i128 asm 
"{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09} \0A\09", "=q,l,0"(i64 %i.04, i128 %1) + %2 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09}", "=q,l,0"(i64 %i.04, i128 %1) %3 = bitcast i128 %2 to <2 x i64> store <2 x i64> %3, ptr addrspace(1) @x, align 16 %inc = add nuw i64 %i.04, 1 From cb43464197bc432980814df939232e8e78aa3cb0 Mon Sep 17 00:00:00 2001 From: chengjunp Date: Wed, 26 Jun 2024 22:25:04 +0000 Subject: [PATCH 5/6] Rename variables in tests & Use dag helper to build nodes --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4 +- .../CodeGen/NVPTX/inline-asm-b128-test1.ll | 13 ++---- .../CodeGen/NVPTX/inline-asm-b128-test2.ll | 36 +++++++-------- .../CodeGen/NVPTX/inline-asm-b128-test3.ll | 45 +++++++++---------- 4 files changed, 41 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 0bfcba602cb51..26c16ee9fd18f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3111,14 +3111,14 @@ SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op, SDNode *Node = Op.getNode(); SDLoc DL(Node); - SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op->getOperand(2)); + SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2)); SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, DAG.getIntPtrConstant(0, DL)); SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, DAG.getIntPtrConstant(1, DL)); SmallVector NewOps(Op->getNumOperands() + 1); - SmallVector ResultsType(Node->value_begin(), Node->value_end()); + SmallVector ResultsType(Node->values()); NewOps[0] = Op->getOperand(0); // Chain NewOps[1] = Op->getOperand(1); // Dst Reg diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll index a04ed40dbf91a..3232f40a40a70 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -83,10 +83,10 @@ define void @test_b128_input_from_select(ptr nocapture readonly %flag) { ; CHECK-NEXT: ret; %1 = addrspacecast ptr %flag to ptr addrspace(1) - %tmp1 = load i8, ptr addrspace(1) %1, align 1 - %tobool.not = icmp eq i8 %tmp1, 0 - %. = select i1 %tobool.not, i128 24, i128 42 - tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %.) 
+ %2 = load i8, ptr addrspace(1) %1, align 1 + %3 = icmp eq i8 %2, 0 + %4 = select i1 %3, i128 24, i128 42 + tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %4) ret void } @@ -146,8 +146,3 @@ define void @test_use_of_b128_output(ptr nocapture readonly %data) { store <2 x i64> %5, ptr addrspace(1) @value, align 16 ret void } - -!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1} - -!0 = !{i32 2, i32 0, i32 3, i32 1} -!1 = !{i32 2, i32 0} diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll index bb45ff6ba2e27..3d1d7fbbe27e8 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -103,26 +103,20 @@ define void @test_corner_values() { ; CHECK-NEXT: // end inline asm ; CHECK-NEXT: ret; - %tmp = load ptr, ptr addrspace(1) @v64, align 8 - %add.ptr2 = getelementptr inbounds i64, ptr %tmp, i64 1 - tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -1, ptr %tmp, ptr nonnull %add.ptr2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr)) - %tmp3 = load ptr, ptr addrspace(1) @v64, align 8 - %add.ptr4 = getelementptr inbounds i64, ptr %tmp3, i64 2 - %add.ptr6 = getelementptr inbounds i64, ptr %tmp3, i64 3 - tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %add.ptr4, ptr nonnull %add.ptr6, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr)) - %tmp7 = load ptr, ptr addrspace(1) @v64, align 8 - %add.ptr8 = getelementptr inbounds i64, ptr %tmp7, i64 4 - %add.ptr10 = getelementptr inbounds i64, ptr %tmp7, i64 5 - tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %add.ptr8, ptr nonnull %add.ptr10, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr)) - %tmp11 = load ptr, ptr addrspace(1) @v64, align 8 - %add.ptr12 = getelementptr inbounds i64, ptr %tmp11, i64 6 - %add.ptr14 = getelementptr inbounds i64, ptr %tmp11, i64 7 - tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 0, ptr nonnull %add.ptr12, ptr nonnull %add.ptr14, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr)) + %1 = load ptr, ptr addrspace(1) @v64, align 8 + %2 = getelementptr inbounds i64, ptr %1, i64 1 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -1, ptr %1, ptr nonnull %2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr)) + %3 = load ptr, ptr addrspace(1) @v64, align 8 + %4 = getelementptr inbounds i64, ptr %3, i64 2 + %5 = getelementptr inbounds i64, ptr %3, i64 3 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 
170141183460469231731687303715884105727, ptr nonnull %4, ptr nonnull %5, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr)) + %6 = load ptr, ptr addrspace(1) @v64, align 8 + %7 = getelementptr inbounds i64, ptr %6, i64 4 + %8 = getelementptr inbounds i64, ptr %6, i64 5 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %7, ptr nonnull %8, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr)) + %9 = load ptr, ptr addrspace(1) @v64, align 8 + %10 = getelementptr inbounds i64, ptr %9, i64 6 + %11 = getelementptr inbounds i64, ptr %9, i64 7 + tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 0, ptr nonnull %10, ptr nonnull %11, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr)) ret void } - - -!nvvmir.version = !{!2, !3, !2, !3, !3, !2, !2, !2, !3} - -!2 = !{i32 2, i32 0, i32 3, i32 1} -!3 = !{i32 2, i32 0} diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index 081956447345c..ae453977123e0 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -18,11 +18,11 @@ define void @test_b128_in_loop() { ; CHECK-NEXT: ld.global.s32 %rd1, [size]; ; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0; ; CHECK-NEXT: @%p1 bra $L__BB0_3; -; CHECK-NEXT: // %bb.1: // %.lr.ph.preheader +; CHECK-NEXT: // %bb.1: // %BB1 ; CHECK-NEXT: ld.global.u64 %rd13, [x+8]; ; CHECK-NEXT: ld.global.u64 %rd12, [x]; ; CHECK-NEXT: mov.u64 %rd14, 0; -; CHECK-NEXT: $L__BB0_2: // %.lr.ph +; CHECK-NEXT: $L__BB0_2: // %BB2 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov.b128 %rq1, {%rd12, %rd13}; ; CHECK-NEXT: // begin inline asm @@ -40,33 +40,28 @@ define void @test_b128_in_loop() { ; CHECK-NEXT: add.s64 %rd14, %rd14, 1; ; CHECK-NEXT: setp.ne.s64 %p2, %rd1, %rd14; ; CHECK-NEXT: @%p2 bra $L__BB0_2; -; CHECK-NEXT: $L__BB0_3: // %._crit_edge +; CHECK-NEXT: $L__BB0_3: // %BB3 ; CHECK-NEXT: ret; - %tmp11 = load i32, ptr addrspace(1) @size, align 4 - %cmp3.not = icmp eq i32 %tmp11, 0 - br i1 %cmp3.not, label %._crit_edge, label %.lr.ph.preheader + %1 = load i32, ptr addrspace(1) @size, align 4 + %2 = icmp eq i32 %1, 0 + br i1 %2, label %BB3, label %BB1 -.lr.ph.preheader: ; preds = %0 - %x.promoted5 = load i128, ptr addrspace(1) @x, align 16 - %umax = sext i32 %tmp11 to i64 - br label %.lr.ph +BB1: ; preds = %0 + %3 = load i128, ptr addrspace(1) @x, align 16 + %4 = sext i32 %1 to i64 + br label %BB2 -.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader - %1 = phi i128 [ %2, %.lr.ph ], [ %x.promoted5, %.lr.ph.preheader ] - %i.04 = phi i64 [ %inc, %.lr.ph ], [ 0, %.lr.ph.preheader ] - %2 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09}", "=q,l,0"(i64 %i.04, i128 %1) - %3 = bitcast i128 %2 to <2 x i64> - store <2 x i64> %3, ptr addrspace(1) @x, align 16 - %inc = add nuw i64 %i.04, 1 - %exitcond.not = icmp eq i64 %inc, %umax - br i1 %exitcond.not, label %._crit_edge, label %.lr.ph +BB2: ; preds = %BB2, %BB1 + %5 = phi i128 [ %7, %BB2 ], [ %3, %BB1 ] + %6 = phi i64 [ %9, %BB2 ], [ 0, %BB1 ] + %7 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg 
.b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09}", "=q,l,0"(i64 %6, i128 %5) + %8 = bitcast i128 %7 to <2 x i64> + store <2 x i64> %8, ptr addrspace(1) @x, align 16 + %9 = add nuw i64 %6, 1 + %10 = icmp eq i64 %9, %4 + br i1 %10, label %BB3, label %BB2 -._crit_edge: ; preds = %.lr.ph, %0 +BB3: ; preds = %BB2, %0 ret void } - -!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1} - -!0 = !{i32 2, i32 0, i32 3, i32 1} -!1 = !{i32 2, i32 0} From a047ab23c4a2fa19e597eb495342a2936f99e7ef Mon Sep 17 00:00:00 2001 From: chengjunp Date: Fri, 28 Jun 2024 22:05:15 +0000 Subject: [PATCH 6/6] Update LangRef for the new constraint code for 128-bit values in NVPTX. --- llvm/docs/LangRef.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 211fee5f008a0..e2c47204e628f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -5381,6 +5381,7 @@ NVPTX: - ``c`` or ``h``: A 16-bit integer register. - ``r``: A 32-bit integer register. - ``l`` or ``N``: A 64-bit integer register. +- ``q``: A 128-bit integer register. - ``f``: A 32-bit float register. - ``d``: A 64-bit float register.
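
Illustrative usage sketch (editor's note, not part of the patch series): with the clang constraint change and the LangRef entry above, a 128-bit value can be handed to PTX inline asm directly from CUDA C++ when compiled with a clang/LLVM that carries these patches. Everything below is an assumption rather than code taken from the series: the function name add_to_low_half is hypothetical, "+q" assumes clang ties the output and input operands the same way the "=q,...,0" form does in the IR tests, and mov.b128 assumes a GPU target and PTX version recent enough to provide 128-bit register moves.

// Hypothetical sketch: exercising the new NVPTX "q" constraint from CUDA C++.
// Not taken from the patch; the function name and target requirements are
// assumptions.
#include <cstdint>

__device__ __int128 add_to_low_half(__int128 value, uint64_t inc) {
  // "+q" binds the 128-bit operand to a 128-bit (%rq) register. Inside the
  // asm body it is unpacked into two .b64 halves with mov.b128, the low half
  // is incremented, and the pair is packed back into the same operand.
  asm("{\n\t"
      ".reg .b64 lo;\n\t"
      ".reg .b64 hi;\n\t"
      "mov.b128 {lo, hi}, %0;\n\t"
      "add.cc.u64 lo, lo, %1;\n\t"
      "mov.b128 %0, {lo, hi};\n\t"
      "}"
      : "+q"(value)
      : "l"(inc));
  return value;
}

This mirrors, at the CUDA source level, the "=q,l,0" pattern exercised in the inline-asm-b128-test3.ll loop above.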