From 9b965ba72db4bb354fc548657e571ae26001af94 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Mon, 15 Jul 2024 12:23:44 -0700 Subject: [PATCH 01/22] [NVPTX] Support fence instruction --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 91b239a52d17f..8f8ecac933b4d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3930,7 +3930,6 @@ def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, / def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1) Requires<[hasPTX<60>, hasSM<70>]>; - // If PTX<60 or SM<70, we fall back to MEMBAR: def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1) def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1) From 66161444eef611a917d10a34d5d639662bca852a Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 9 Jul 2024 18:17:44 +0200 Subject: [PATCH 02/22] [NVPTX] Volta SequentiallyConsistent Load/Store Ops --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 10 ++ llvm/lib/Target/NVPTX/NVPTX.h | 3 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 99 ++++++++---- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 +++++++++--------- llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 55 +++++++ 5 files changed, 210 insertions(+), 101 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index a004d64c21cc6..addb326e156f0 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -254,6 +254,16 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, report_fatal_error(OS.str()); break; } + } else if (!strcmp(Modifier, "sc")) { + switch (Imm) { + // TODO: refactor fence insertion in ISelDagToDag instead of here + // as part of implementing atomicrmw seq_cst. 
+ case NVPTX::PTXLdStInstCode::SeqCstFence: + O << "fence.sc.sys;\n\t"; + break; + default: + break; + } } else if (!strcmp(Modifier, "addsp")) { switch (Imm) { case NVPTX::PTXLdStInstCode::GLOBAL: diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 3c7167b157025..d6a4471e2bc33 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -113,7 +113,8 @@ enum MemorySemantic { Relaxed = 2, Acquire = 3, Release = 4, - RelaxedMMIO = 5 + RelaxedMMIO = 5, + SeqCstFence = 6, }; enum AddressSpace { GENERIC = 0, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 96456ad0547ea..4077aa4f78ae9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -714,21 +714,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } -static unsigned int getCodeMemorySemantic(MemSDNode *N, - const NVPTXSubtarget *Subtarget) { +struct MemorySemantic { + unsigned int sem = -1; + unsigned int sc_fence = -1; + MemorySemantic(unsigned int s) : sem(s) {} + MemorySemantic(unsigned int s, unsigned int f) : sem(s), sc_fence(f) {} +}; + +static MemorySemantic getCodeMemorySemantic(MemSDNode *N, + const NVPTXSubtarget *Subtarget) { AtomicOrdering Ordering = N->getSuccessOrdering(); auto CodeAddrSpace = getCodeAddrSpace(N); bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); - // TODO: lowering for SequentiallyConsistent Operations: for now, we error. - // TODO: lowering for AcquireRelease Operations: for now, we error. - // - // clang-format off - // Lowering for non-SequentiallyConsistent Operations + // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error). // // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ | // |---------|----------|--------------------|------------|------------------------------| @@ -749,6 +752,18 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // | Other | Yes | Generic, Shared, | Error [2] | [3] | // | | | / Global [0] | | | + // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX + // by following the ABI proven sound in: + // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19. + // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043 + // + // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence | + // |-----------------------------------------------------------------------------|-----------------------------------------| + // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_) | fence.sc.; | + // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_) | fence.sc.; ld.acquire.; | + // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_) | fence.sc.; st.release.; | + // | cuda::atomic_fetch_(memory_order_seq_cst, cuda::thread_scope_) | fence.sc.; atom.acq_rel.; | + // clang-format on // [0]: volatile and atomics are only supported on global or shared @@ -788,7 +803,6 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, // - the "weak" memory instruction we are currently lowering to, and // - some other instruction that preserves the side-effect, e.g., // a dead dummy volatile load. 
- if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT || CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) { @@ -870,16 +884,32 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N, N->print(OS); report_fatal_error(OS.str()); } - case AtomicOrdering::SequentiallyConsistent: - // TODO: support AcquireRelease and SequentiallyConsistent - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "NVPTX backend does not support AtomicOrdering \"" - << toIRString(Ordering) << "\" yet."; - report_fatal_error(OS.str()); + case AtomicOrdering::SequentiallyConsistent: { + unsigned int sem; + if (N->readMem()) { + sem = NVPTX::PTXLdStInstCode::Acquire; + } else if (N->writeMem()) { + sem = NVPTX::PTXLdStInstCode::Release; + } else { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "NVPTX does not support SequentiallyConsistent Ordering on " + "read-modify-writes yet: " + << N->getOperationName(); + N->print(OS); + report_fatal_error(OS.str()); + } + return addrGenericOrGlobalOrShared + ? MemorySemantic(sem, NVPTX::PTXLdStInstCode::SeqCstFence) + : MemorySemantic(NVPTX::PTXLdStInstCode::NotAtomic); + } } - llvm_unreachable("unexpected unhandled case"); + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "NVPTX backend does not support AtomicOrdering \"" + << toIRString(Ordering) << "\" yet."; + report_fatal_error(OS.str()); } static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, @@ -1091,7 +1121,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { } // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget); + auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(LD, Subtarget); unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); @@ -1136,7 +1166,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + SDValue Ops[] = {getI32Imm(SeqCstFence, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1151,7 +1182,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + SDValue Ops[] = {getI32Imm(SeqCstFence, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1173,7 +1205,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + SDValue Ops[] = {getI32Imm(SeqCstFence, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1194,7 +1227,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, dl), + SDValue Ops[] = {getI32Imm(SeqCstFence, dl), + getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1238,7 +1272,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); + auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget); // Vector Setting MVT SimpleVT = 
LoadedVT.getSimpleVT(); @@ -1305,7 +1339,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + SDValue Ops[] = {getI32Imm(SeqCstFence, DL), + getI32Imm(CodeMemorySem, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1334,7 +1369,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + SDValue Ops[] = {getI32Imm(SeqCstFence, DL), + getI32Imm(CodeMemorySem, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1384,7 +1420,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + SDValue Ops[] = {getI32Imm(SeqCstFence, DL), + getI32Imm(CodeMemorySem, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1434,7 +1471,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(CodeMemorySem, DL), + SDValue Ops[] = {getI32Imm(SeqCstFence, DL), + getI32Imm(CodeMemorySem, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1889,7 +1927,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget); + auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(ST, Subtarget); // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); @@ -1926,6 +1964,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, + getI32Imm(SeqCstFence, dl), getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), @@ -1943,6 +1982,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, + getI32Imm(SeqCstFence, dl), getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), @@ -1968,6 +2008,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; SDValue Ops[] = {Value, + getI32Imm(SeqCstFence, dl), getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), @@ -1990,6 +2031,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, + getI32Imm(SeqCstFence, dl), getI32Imm(CodeMemorySem, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), @@ -2030,7 +2072,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); // Memory Semantic Setting - unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget); + auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' @@ -2072,6 +2114,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } + StOps.push_back(getI32Imm(SeqCstFence, DL)); StOps.push_back(getI32Imm(CodeMemorySem, DL)); StOps.push_back(getI32Imm(CodeAddrSpace, DL)); StOps.push_back(getI32Imm(VecType, DL)); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 8f8ecac933b4d..77375700e865f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2958,39 +2958,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { multiclass LD { def _avar : 
NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _ari : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _ari_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _asi : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; } @@ -3006,39 +3006,39 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST { def _avar : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), - 
"st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _ari : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _ari_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _asi : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; } @@ -3057,75 +3057,75 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC { def _v2_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - 
"ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v4_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", 
[]>; } let mayLoad=1, hasSideEffects=0 in { @@ -3140,84 +3140,84 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST_VEC { def _v2_avar : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_ari : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_ari_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_asi : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v4_avar : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - 
"st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_asi : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}" + "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}" "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; } diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 68915b0f2698b..15dd4d6087cb3 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -247,6 +247,59 @@ define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) ret void } +; CHECK-LABEL: generic_sc +define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; 
CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e seq_cst, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e seq_cst, align 8 + + ret void +} + ; CHECK-LABEL: generic_monotonic_volatile define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] @@ -1277,3 +1330,5 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ret void } + + From c2711d1e469b35811c3e53298029257a21eb682e Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 11 Jul 2024 14:42:32 +0200 Subject: [PATCH 03/22] [NVPTX] Align Memory Ordering enum with LLVM --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 24 ++-- llvm/lib/Target/NVPTX/NVPTX.h | 53 +++++-- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 131 ++++++++++-------- 3 files changed, 133 insertions(+), 75 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index addb326e156f0..d7a3daa450f0f 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -228,37 +228,41 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int) MO.getImm(); if (!strcmp(Modifier, "sem")) { - switch (Imm) { - case NVPTX::PTXLdStInstCode::NotAtomic: + auto ordering = + NVPTX::Ordering(static_cast(Imm)); + switch (ordering) { + case NVPTX::Ordering::NotAtomic: break; - case NVPTX::PTXLdStInstCode::Volatile: + case NVPTX::Ordering::Volatile: O << ".volatile"; break; - case NVPTX::PTXLdStInstCode::Relaxed: + case NVPTX::Ordering::Relaxed: O << ".relaxed.sys"; break; - case NVPTX::PTXLdStInstCode::Acquire: + case NVPTX::Ordering::Acquire: O << ".acquire.sys"; break; - case NVPTX::PTXLdStInstCode::Release: + case NVPTX::Ordering::Release: O << ".release.sys"; break; - case NVPTX::PTXLdStInstCode::RelaxedMMIO: + case NVPTX::Ordering::RelaxedMMIO: O << ".mmio.relaxed.sys"; break; default: SmallString<256> Msg; raw_svector_ostream OS(Msg); - OS << "NVPTX LdStCode Printer does not support \"" << Imm + OS << "NVPTX LdStCode Printer does not support \"" << ordering << "\" sem modifier."; report_fatal_error(OS.str()); break; } } else if (!strcmp(Modifier, "sc")) { - switch (Imm) { + auto ordering = + NVPTX::Ordering(static_cast(Imm)); + switch (ordering) { // TODO: refactor fence insertion in ISelDagToDag instead of here // as part of implementing atomicrmw seq_cst. 
- case NVPTX::PTXLdStInstCode::SeqCstFence: + case NVPTX::Ordering::SequentiallyConsistent: O << "fence.sc.sys;\n\t"; break; default: diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index d6a4471e2bc33..602ab6e150e2a 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -106,16 +106,53 @@ enum LoadStore { isStoreShift = 6 }; -namespace PTXLdStInstCode { -enum MemorySemantic { +// Extends LLVM AtomicOrdering with PTX Orderings: +using OrderingUnderlyingType = unsigned int; +enum class Ordering : OrderingUnderlyingType { NotAtomic = 0, // PTX calls these: "Weak" - Volatile = 1, + // Unordered = 1, // TODO: NVPTX should map this to "Relaxed" Relaxed = 2, - Acquire = 3, - Release = 4, - RelaxedMMIO = 5, - SeqCstFence = 6, + // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire" + Acquire = 4, + Release = 5, + // AcquireRelease = 6, // TODO + SequentiallyConsistent = 7, + Volatile = 8, + RelaxedMMIO = 9, + LAST = RelaxedMMIO }; + +template OStream &operator<<(OStream &os, Ordering order) { + switch (order) { + case Ordering::NotAtomic: + os << "NotAtomic"; + return os; + case Ordering::Relaxed: + os << "Relaxed"; + return os; + case Ordering::Acquire: + os << "Acquire"; + return os; + case Ordering::Release: + os << "Release"; + return os; + // case Ordering::AcquireRelease: + // os << "AcquireRelease"; + // return os; + case Ordering::SequentiallyConsistent: + os << "SequentiallyConsistent"; + return os; + case Ordering::Volatile: + os << "Volatile"; + return os; + case Ordering::RelaxedMMIO: + os << "RelaxedMMIO"; + return os; + } + report_fatal_error("unknown ordering"); +} + +namespace PTXLdStInstCode { enum AddressSpace { GENERIC = 0, GLOBAL = 1, @@ -135,7 +172,7 @@ enum VecType { V2 = 2, V4 = 4 }; -} +} // namespace PTXLdStInstCode /// PTXCvtMode - Conversion code enumeration namespace PTXCvtMode { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 4077aa4f78ae9..024fc4e0056e6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -714,15 +714,17 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } -struct MemorySemantic { - unsigned int sem = -1; - unsigned int sc_fence = -1; - MemorySemantic(unsigned int s) : sem(s) {} - MemorySemantic(unsigned int s, unsigned int f) : sem(s), sc_fence(f) {} +struct OperationOrderings { + NVPTX::OrderingUnderlyingType instr_ordering; + NVPTX::OrderingUnderlyingType fence_ordering; + OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic, + NVPTX::Ordering f = NVPTX::Ordering::NotAtomic) + : instr_ordering(static_cast(o)), + fence_ordering(static_cast(f)) {} }; -static MemorySemantic getCodeMemorySemantic(MemSDNode *N, - const NVPTXSubtarget *Subtarget) { +static OperationOrderings +getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { AtomicOrdering Ordering = N->getSuccessOrdering(); auto CodeAddrSpace = getCodeAddrSpace(N); @@ -806,7 +808,7 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N, if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT || CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) { - return NVPTX::PTXLdStInstCode::NotAtomic; + return NVPTX::Ordering::NotAtomic; } // [2]: Atomics with Ordering different than Unordered or Relaxed are not @@ -840,20 +842,22 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N, switch 
(Ordering) { case AtomicOrdering::NotAtomic: return N->isVolatile() && AddrGenericOrGlobalOrShared - ? NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + ? NVPTX::Ordering::Volatile + : NVPTX::Ordering::NotAtomic; case AtomicOrdering::Unordered: // We lower unordered in the exact same way as 'monotonic' to respect // LLVM IR atomicity requirements. case AtomicOrdering::Monotonic: if (N->isVolatile()) - return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO - : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO + : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile + : NVPTX::Ordering::NotAtomic; else - return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed - : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile - : NVPTX::PTXLdStInstCode::NotAtomic; + return HasMemoryOrdering ? NVPTX::Ordering::Relaxed + : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile + : NVPTX::Ordering::NotAtomic; + // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to + // Acquire. case AtomicOrdering::Acquire: if (!N->readMem()) { SmallString<256> Msg; @@ -863,8 +867,8 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N, N->print(OS); report_fatal_error(OS.str()); } - return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire - : NVPTX::PTXLdStInstCode::NotAtomic; + return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Acquire + : NVPTX::Ordering::NotAtomic; case AtomicOrdering::Release: if (!N->writeMem()) { SmallString<256> Msg; @@ -874,22 +878,30 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N, N->print(OS); report_fatal_error(OS.str()); } - return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release - : NVPTX::PTXLdStInstCode::NotAtomic; + return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Release + : NVPTX::Ordering::NotAtomic; case AtomicOrdering::AcquireRelease: { SmallString<256> Msg; raw_svector_ostream OS(Msg); - OS << "PTX only supports AcquireRelease Ordering on read-modify-write: " + OS << "NVPTX does not support AcquireRelease Ordering on read-modify-write " + "yet and PTX does not support it on loads or stores: " << N->getOperationName(); N->print(OS); report_fatal_error(OS.str()); } case AtomicOrdering::SequentiallyConsistent: { - unsigned int sem; + // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX + // sequence including a "fence.sc.sco" and the memory instruction with an + // Ordering that differs from "sc": acq, rel, or acq_rel, depending on + // whether the memory operation is a read, write, or read-modify-write. + // + // This sets the ordering of the fence to SequentiallyConsistent, and + // sets the corresponding ordering for the instruction. + NVPTX::Ordering ord; if (N->readMem()) { - sem = NVPTX::PTXLdStInstCode::Acquire; + ord = NVPTX::Ordering::Acquire; } else if (N->writeMem()) { - sem = NVPTX::PTXLdStInstCode::Release; + ord = NVPTX::Ordering::Release; } else { SmallString<256> Msg; raw_svector_ostream OS(Msg); @@ -899,9 +911,10 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N, N->print(OS); report_fatal_error(OS.str()); } - return addrGenericOrGlobalOrShared - ? MemorySemantic(sem, NVPTX::PTXLdStInstCode::SeqCstFence) - : MemorySemantic(NVPTX::PTXLdStInstCode::NotAtomic); + return AddrGenericOrGlobalOrShared + ? 
OperationOrderings(ord, + NVPTX::Ordering::SequentiallyConsistent) + : OperationOrderings(NVPTX::Ordering::NotAtomic); } } @@ -1121,7 +1134,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { } // Memory Semantic Setting - auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(LD, Subtarget); + auto [InstructionOrdering, FenceOrdering] = + getOperationOrderings(LD, Subtarget); unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); @@ -1166,8 +1180,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(SeqCstFence, dl), - getI32Imm(CodeMemorySem, dl), + SDValue Ops[] = {getI32Imm(FenceOrdering, dl), + getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1182,8 +1196,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(SeqCstFence, dl), - getI32Imm(CodeMemorySem, dl), + SDValue Ops[] = {getI32Imm(FenceOrdering, dl), + getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1205,8 +1219,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(SeqCstFence, dl), - getI32Imm(CodeMemorySem, dl), + SDValue Ops[] = {getI32Imm(FenceOrdering, dl), + getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1227,8 +1241,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(SeqCstFence, dl), - getI32Imm(CodeMemorySem, dl), + SDValue Ops[] = {getI32Imm(FenceOrdering, dl), + getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1272,7 +1286,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); // Memory Semantic Setting - auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget); + auto [InstructionOrdering, FenceOrdering] = + getOperationOrderings(MemSD, Subtarget); // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); @@ -1339,8 +1354,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(SeqCstFence, DL), - getI32Imm(CodeMemorySem, DL), + SDValue Ops[] = {getI32Imm(FenceOrdering, DL), + getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1369,8 +1384,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(SeqCstFence, DL), - getI32Imm(CodeMemorySem, DL), + SDValue Ops[] = {getI32Imm(FenceOrdering, DL), + getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1420,8 +1435,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(SeqCstFence, DL), - getI32Imm(CodeMemorySem, DL), + SDValue Ops[] = {getI32Imm(FenceOrdering, DL), + getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1471,8 +1486,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = 
{getI32Imm(SeqCstFence, DL), - getI32Imm(CodeMemorySem, DL), + SDValue Ops[] = {getI32Imm(FenceOrdering, DL), + getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1927,7 +1942,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); // Memory Semantic Setting - auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(ST, Subtarget); + auto [InstructionOrdering, FenceOrdering] = + getOperationOrderings(ST, Subtarget); // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); @@ -1964,8 +1980,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(SeqCstFence, dl), - getI32Imm(CodeMemorySem, dl), + getI32Imm(FenceOrdering, dl), + getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -1982,8 +1998,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(SeqCstFence, dl), - getI32Imm(CodeMemorySem, dl), + getI32Imm(FenceOrdering, dl), + getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -2008,8 +2024,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; SDValue Ops[] = {Value, - getI32Imm(SeqCstFence, dl), - getI32Imm(CodeMemorySem, dl), + getI32Imm(FenceOrdering, dl), + getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -2031,8 +2047,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(SeqCstFence, dl), - getI32Imm(CodeMemorySem, dl), + getI32Imm(FenceOrdering, dl), + getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(toType, dl), @@ -2072,7 +2088,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); // Memory Semantic Setting - auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget); + auto [InstructionOrdering, FenceOrdering] = + getOperationOrderings(MemSD, Subtarget); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' @@ -2114,8 +2131,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - StOps.push_back(getI32Imm(SeqCstFence, DL)); - StOps.push_back(getI32Imm(CodeMemorySem, DL)); + StOps.push_back(getI32Imm(FenceOrdering, DL)); + StOps.push_back(getI32Imm(InstructionOrdering, DL)); StOps.push_back(getI32Imm(CodeAddrSpace, DL)); StOps.push_back(getI32Imm(VecType, DL)); StOps.push_back(getI32Imm(ToType, DL)); From 8299d830b55ed68e936096d48cf0b8bc5886d2de Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 11 Jul 2024 18:50:37 +0200 Subject: [PATCH 04/22] [NVPTX]: Fix typos --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 10 +++--- llvm/lib/Target/NVPTX/NVPTX.h | 36 +++++++++---------- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 35 ++++++++++-------- llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 2 -- 4 files changed, 44 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index d7a3daa450f0f..45561d5a11238 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -228,9 +228,9 @@ void 
NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int) MO.getImm(); if (!strcmp(Modifier, "sem")) { - auto ordering = + auto Ordering = NVPTX::Ordering(static_cast(Imm)); - switch (ordering) { + switch (Ordering) { case NVPTX::Ordering::NotAtomic: break; case NVPTX::Ordering::Volatile: @@ -251,15 +251,15 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, default: SmallString<256> Msg; raw_svector_ostream OS(Msg); - OS << "NVPTX LdStCode Printer does not support \"" << ordering + OS << "NVPTX LdStCode Printer does not support \"" << Ordering << "\" sem modifier."; report_fatal_error(OS.str()); break; } } else if (!strcmp(Modifier, "sc")) { - auto ordering = + auto Ordering = NVPTX::Ordering(static_cast(Imm)); - switch (ordering) { + switch (Ordering) { // TODO: refactor fence insertion in ISelDagToDag instead of here // as part of implementing atomicrmw seq_cst. case NVPTX::Ordering::SequentiallyConsistent: diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 602ab6e150e2a..c9cce23788ca4 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -122,32 +122,32 @@ enum class Ordering : OrderingUnderlyingType { LAST = RelaxedMMIO }; -template OStream &operator<<(OStream &os, Ordering order) { - switch (order) { +template OStream &operator<<(OStream &O, Ordering Order) { + switch (Order) { case Ordering::NotAtomic: - os << "NotAtomic"; - return os; + O << "NotAtomic"; + return O; case Ordering::Relaxed: - os << "Relaxed"; - return os; + O << "Relaxed"; + return O; case Ordering::Acquire: - os << "Acquire"; - return os; + O << "Acquire"; + return O; case Ordering::Release: - os << "Release"; - return os; + O << "Release"; + return O; // case Ordering::AcquireRelease: - // os << "AcquireRelease"; - // return os; + // O << "AcquireRelease"; + // return O; case Ordering::SequentiallyConsistent: - os << "SequentiallyConsistent"; - return os; + O << "SequentiallyConsistent"; + return O; case Ordering::Volatile: - os << "Volatile"; - return os; + O << "Volatile"; + return O; case Ordering::RelaxedMMIO: - os << "RelaxedMMIO"; - return os; + O << "RelaxedMMIO"; + return O; } report_fatal_error("unknown ordering"); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 024fc4e0056e6..0e182dcad7510 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -715,12 +715,12 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { } struct OperationOrderings { - NVPTX::OrderingUnderlyingType instr_ordering; - NVPTX::OrderingUnderlyingType fence_ordering; + NVPTX::OrderingUnderlyingType InstrOrdering; + NVPTX::OrderingUnderlyingType FenceOrdering; OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic, NVPTX::Ordering f = NVPTX::Ordering::NotAtomic) - : instr_ordering(static_cast(o)), - fence_ordering(static_cast(f)) {} + : InstrOrdering(static_cast(o)), + FenceOrdering(static_cast(f)) {} }; static OperationOrderings @@ -759,12 +759,19 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19. 
// https://dl.acm.org/doi/pdf/10.1145/3297858.3304043 // - // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence | - // |-----------------------------------------------------------------------------|-----------------------------------------| - // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_) | fence.sc.; | - // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_) | fence.sc.; ld.acquire.; | - // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_) | fence.sc.; st.release.; | - // | cuda::atomic_fetch_(memory_order_seq_cst, cuda::thread_scope_) | fence.sc.; atom.acq_rel.; | + // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_thread_fence | fence.sc.; | + // | (memory_order_seq_cst, cuda::thread_scope_) | | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_load | fence.sc.; | + // | (memory_order_seq_cst, cuda::thread_scope_) | ld.acquire.; | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_store | fence.sc.; | + // | (memory_order_seq_cst, cuda::thread_scope_) | st.release.; | + // |------------------------------------------------------|-------------------------------| + // | cuda::atomic_fetch_ | fence.sc.; | + // | (memory_order_seq_cst, cuda::thread_scope_) | atom.acq_rel.; | // clang-format on @@ -897,11 +904,11 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { // // This sets the ordering of the fence to SequentiallyConsistent, and // sets the corresponding ordering for the instruction. - NVPTX::Ordering ord; + NVPTX::Ordering InstrOrder; if (N->readMem()) { - ord = NVPTX::Ordering::Acquire; + InstrOrder = NVPTX::Ordering::Acquire; } else if (N->writeMem()) { - ord = NVPTX::Ordering::Release; + InstrOrder = NVPTX::Ordering::Release; } else { SmallString<256> Msg; raw_svector_ostream OS(Msg); @@ -912,7 +919,7 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { report_fatal_error(OS.str()); } return AddrGenericOrGlobalOrShared - ? OperationOrderings(ord, + ? 
OperationOrderings(InstrOrder, NVPTX::Ordering::SequentiallyConsistent) : OperationOrderings(NVPTX::Ordering::NotAtomic); } diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 15dd4d6087cb3..9fbf6970935d9 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1330,5 +1330,3 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ret void } - - From 8066f50265e6e013d6b15345b82e695ae2560c52 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 16 Jul 2024 01:00:37 -0700 Subject: [PATCH 05/22] [NVPTX] Cleanup SeqCst Load/Store --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 109 +++++++++++---- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 ++++++++++---------- 2 files changed, 158 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0e182dcad7510..0addd3dc9aaf0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1147,6 +1147,26 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); + // If a fence is required before the operation, insert it: + SDValue Chain = N->getOperand(0); + switch (NVPTX::Ordering(FenceOrdering)) { + case NVPTX::Ordering::NotAtomic: + break; + case NVPTX::Ordering::SequentiallyConsistent: { + unsigned Op = Subtarget->hasMemoryOrdering() + ? NVPTX::atomic_thread_fence_seq_cst_sys + : NVPTX::atomic_thread_fence_seq_cst_sys_membar; + Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0); + break; + } + default: + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering) + << "\"."; + report_fatal_error(OS.str()); + } + // Type Setting: fromType + fromTypeWidth // // Sign : ISD::SEXTLOAD @@ -1174,7 +1194,6 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { fromType = getLdStRegType(ScalarVT); // Create the machine instruction DAG - SDValue Chain = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue Addr; SDValue Offset, Base; @@ -1187,8 +1206,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(FenceOrdering, dl), - getI32Imm(InstructionOrdering, dl), + SDValue Ops[] = {getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1203,8 +1221,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(FenceOrdering, dl), - getI32Imm(InstructionOrdering, dl), + SDValue Ops[] = {getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1226,8 +1243,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(FenceOrdering, dl), - getI32Imm(InstructionOrdering, dl), + SDValue Ops[] = {getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1248,8 +1264,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(FenceOrdering, dl), - getI32Imm(InstructionOrdering, dl), + SDValue Ops[] = 
{getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), getI32Imm(fromType, dl), @@ -1296,6 +1311,25 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { auto [InstructionOrdering, FenceOrdering] = getOperationOrderings(MemSD, Subtarget); + // If a fence is required before the operation, insert it: + switch (NVPTX::Ordering(FenceOrdering)) { + case NVPTX::Ordering::NotAtomic: + break; + case NVPTX::Ordering::SequentiallyConsistent: { + unsigned Op = Subtarget->hasMemoryOrdering() + ? NVPTX::atomic_thread_fence_seq_cst_sys + : NVPTX::atomic_thread_fence_seq_cst_sys_membar; + Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); + break; + } + default: + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering) + << "\"."; + report_fatal_error(OS.str()); + } + // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); @@ -1361,8 +1395,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(FenceOrdering, DL), - getI32Imm(InstructionOrdering, DL), + SDValue Ops[] = {getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1391,8 +1424,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(FenceOrdering, DL), - getI32Imm(InstructionOrdering, DL), + SDValue Ops[] = {getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1442,8 +1474,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(FenceOrdering, DL), - getI32Imm(InstructionOrdering, DL), + SDValue Ops[] = {getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1493,8 +1524,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(FenceOrdering, DL), - getI32Imm(InstructionOrdering, DL), + SDValue Ops[] = {getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), getI32Imm(FromType, DL), @@ -1952,6 +1982,26 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { auto [InstructionOrdering, FenceOrdering] = getOperationOrderings(ST, Subtarget); + // If a fence is required before the operation, insert it: + SDValue Chain = ST->getChain(); + switch (NVPTX::Ordering(FenceOrdering)) { + case NVPTX::Ordering::NotAtomic: + break; + case NVPTX::Ordering::SequentiallyConsistent: { + unsigned Op = Subtarget->hasMemoryOrdering() + ? NVPTX::atomic_thread_fence_seq_cst_sys + : NVPTX::atomic_thread_fence_seq_cst_sys_membar; + Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0); + break; + } + default: + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering) + << "\"."; + report_fatal_error(OS.str()); + } + // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; @@ -1971,7 +2021,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { unsigned int toType = getLdStRegType(ScalarVT); // Create the machine instruction DAG - SDValue Chain = ST->getChain(); SDValue Value = PlainStore ? 
PlainStore->getValue() : AtomicStore->getVal(); SDValue BasePtr = ST->getBasePtr(); SDValue Addr; @@ -1987,7 +2036,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(FenceOrdering, dl), getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), @@ -2005,7 +2053,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(FenceOrdering, dl), getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), @@ -2031,7 +2078,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; SDValue Ops[] = {Value, - getI32Imm(FenceOrdering, dl), getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), @@ -2054,7 +2100,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!Opcode) return false; SDValue Ops[] = {Value, - getI32Imm(FenceOrdering, dl), getI32Imm(InstructionOrdering, dl), getI32Imm(CodeAddrSpace, dl), getI32Imm(vecType, dl), @@ -2098,6 +2143,25 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { auto [InstructionOrdering, FenceOrdering] = getOperationOrderings(MemSD, Subtarget); + // If a fence is required before the operation, insert it: + switch (NVPTX::Ordering(FenceOrdering)) { + case NVPTX::Ordering::NotAtomic: + break; + case NVPTX::Ordering::SequentiallyConsistent: { + unsigned Op = Subtarget->hasMemoryOrdering() + ? NVPTX::atomic_thread_fence_seq_cst_sys + : NVPTX::atomic_thread_fence_seq_cst_sys_membar; + Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); + break; + } + default: + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering) + << "\"."; + report_fatal_error(OS.str()); + } + // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' assert(StoreVT.isSimple() && "Store value is not simple"); @@ -2138,7 +2202,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - StOps.push_back(getI32Imm(FenceOrdering, DL)); StOps.push_back(getI32Imm(InstructionOrdering, DL)); StOps.push_back(getI32Imm(CodeAddrSpace, DL)); StOps.push_back(getI32Imm(VecType, DL)); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 77375700e865f..8f8ecac933b4d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2958,39 +2958,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { multiclass LD { def _avar : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 
i32imm:$fromWidth, Int64Regs:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _ari : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _ari_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; def _asi : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr+$offset];", []>; } @@ -3006,39 +3006,39 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST { def _avar : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _ari : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _ari_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; def _asi : NVPTXInst< (outs), - (ins regclass:$src, 
LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, + (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr+$offset], $src;", []>; } @@ -3057,75 +3057,75 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC { def _v2_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v2_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; def _v4_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, 
[$addr];", []>; def _v4_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; def _v4_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; } let mayLoad=1, hasSideEffects=0 in { @@ -3140,84 +3140,84 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST_VEC { def _v2_avar : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sc, 
LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_ari : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_ari_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v2_asi : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2}};", []>; def _v4_avar : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + 
"st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_asi : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}" + "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}" "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; } From 3f3fd6d405e2f14245b9ce840888a9b727ea26a9 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 16 Jul 2024 05:14:21 -0700 Subject: [PATCH 06/22] [NVPTX] Add vector load/store tests and refactor load/store tests --- llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 1181 +++++-------------- llvm/test/CodeGen/NVPTX/load-store.ll | 861 ++++++++++++-- 2 files changed, 1089 insertions(+), 953 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 9fbf6970935d9..4d3b11094d3a1 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,169 +1,7 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s ; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} -; CHECK-LABEL: generic_plain -define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr %a - - ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr %b - - ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr %c - - ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr %d - - ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr %c - - ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr %c - - ret void -} - -; CHECK-LABEL: generic_volatile -define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr %a - - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr %b - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr %c - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr %d - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr %c - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr %c - - ret void -} - -; CHECK-LABEL: generic_unordered -define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a unordered, align 1 - - ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b unordered, align 2 - - ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c unordered, align 4 - - ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d unordered, align 8 - - ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e unordered, align 4 - - ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: generic_monotonic -define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a monotonic, align 1 - - ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b monotonic, align 2 - - ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c monotonic, align 4 - - ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d monotonic, align 8 - - ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e monotonic, align 4 - - ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e monotonic, align 8 - - ret void -} +;; generic statespace ; CHECK-LABEL: generic_acq_rel define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { @@ -203,46 +41,50 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e release, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
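+  ; Summary (comment only, not a CHECK): acquire loads and release stores lower
+  ; directly to ld.acquire.sys / st.release.sys with no additional fence; only
+  ; the seq_cst tests below also check for a preceding fence.sc.sys.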
+ ret void } -; CHECK-LABEL: generic_unordered_volatile -define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a unordered, align 1 +; CHECK-LABEL: generic_acq_rel_volatile +define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a unordered, align 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a release, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b unordered, align 2 + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b unordered, align 2 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b release, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c unordered, align 4 + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c unordered, align 4 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c release, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d unordered, align 8 + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d unordered, align 8 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d release, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e unordered, align 4 + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e acquire, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e unordered, align 4 + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e release, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e unordered, align 8 + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e unordered, align 8 + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e release, align 8 + + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
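+  ; Summary (comment only, not a CHECK): volatile acquire/release atomics reuse
+  ; the ld.acquire.sys / st.release.sys lowering checked in @generic_acq_rel above.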
ret void } @@ -297,297 +139,67 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e seq_cst, align 8 - ret void -} - -; CHECK-LABEL: generic_monotonic_volatile -define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a monotonic, align 1 - - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b monotonic, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c monotonic, align 4 - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d monotonic, align 8 - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e monotonic, align 4 - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e monotonic, align 8 - - ret void -} - -;; global statespace - -; CHECK-LABEL: global_plain -define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(1) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(1) %a - - ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(1) %c - - ret void -} - -; CHECK-LABEL: global_volatile -define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(1) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(1) %a - - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(1) %c - - ret void -} - -; CHECK-LABEL: global_unordered -define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 - - ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 - - ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 - - ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 - - ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr 
addrspace(1) %e unordered, align 4 - - ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. ret void } -; CHECK-LABEL: global_monotonic -define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 +; CHECK-LABEL: generic_sc_volatile +define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a seq_cst, align 1 - ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b seq_cst, align 2 - ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c seq_cst, align 4 - ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d seq_cst, align 8 - ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 + ; 
CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e seq_cst, align 4 - ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 - - ret void -} - -; CHECK-LABEL: global_unordered_volatile -define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 - - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 - - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 - - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 - - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e seq_cst, align 8 - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
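+  ; Sketch (comment only, not a CHECK): volatile seq_cst matches the non-volatile
+  ; lowering in @generic_sc; e.g. for the i32 access the expected PTX sequence is:
+  ;   fence.sc.sys;
+  ;   ld.acquire.sys.u32 %r0, [%rd0];
+  ; (register names here are illustrative.)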
ret void } -; CHECK-LABEL: global_monotonic_volatile -define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 - - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 - - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 - - ret void -} +;; global statespace ; CHECK-LABEL: global_acq_rel define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { @@ -627,6 +239,8 @@ define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e release, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
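+  ; Summary (comment only, not a CHECK): the global statespace adds the .global
+  ; qualifier (ld.acquire.sys.global / st.release.sys.global) but otherwise
+  ; follows the generic acquire/release lowering.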
+ ret void } @@ -668,257 +282,123 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8 - ret void -} - -;; shared statespace - -; CHECK-LABEL: shared_plain -define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(3) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(3) %a - - ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(3) %c - - ret void -} - -; CHECK-LABEL: shared_volatile -define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(3) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(3) %a - - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(3) %c + ; TODO: LLVM IR Verifier does not support atomics on vector types. ret void } -; CHECK-LABEL: shared_unordered -define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 +; CHECK-LABEL: global_seq_cst +define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 - ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 - ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 - ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 - ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], 
%f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4 - ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: shared_unordered_volatile -define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1 - - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2 - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4 - - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8 - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
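+  ; Summary (comment only, not a CHECK): seq_cst accesses to global pointers keep
+  ; the fence.sc.sys + ld.acquire.sys.global / st.release.sys.global pattern for
+  ; every access width checked above.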
ret void } -; CHECK-LABEL: shared_monotonic -define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 +; CHECK-LABEL: global_seq_cst_volatile +define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 - ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 - ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 - ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 - ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4 - ; 
CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 - - ret void -} - -; CHECK-LABEL: shared_monotonic_volatile -define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 - - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 - - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. ret void } +;; shared statespace + ; CHECK-LABEL: shared_acq_rel define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] @@ -957,6 +437,8 @@ define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e release, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
+ ret void } @@ -998,335 +480,302 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8 - ret void -} - -;; local statespace - -; CHECK-LABEL: local_plain -define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(5) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(5) %a - - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(5) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(5) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(5) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(5) %d - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(5) %c - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(5) %c + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
ret void } -; CHECK-LABEL: local_volatile -define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(5) %a +; CHECK-LABEL: shared_seq_cst +define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(5) %a + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(5) %b + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(5) %b + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(5) %d + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(5) %d + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8 + + ; TODO: LLVM IR Verifier does not support atomics on vector types. ret void } -; CHECK-LABEL: local_unordered -define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 +; CHECK-LABEL: shared_seq_cst_volatile +define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] 
+ %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8 + + ; TODO: LLVM IR Verifier does not support atomics on vector types. ret void } -; CHECK-LABEL: local_unordered_volatile -define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 - - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4 - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+;; local statespace
-  ret void
-}
+; CHECK-LABEL: local_acq_rel
+define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; TODO: this codegen loses Concurrent Forward Progress
-; CHECK-LABEL: local_monotonic
-define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+  %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
  %a.add = add i8 %a.load, 1
  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+  store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+  %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
  %b.add = add i16 %b.load, 1
  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+  store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+  %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
  %c.add = add i32 %c.load, 1
  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+  store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+  %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
  %d.add = add i64 %d.load, 1
  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+  store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+  %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
  %e.add = fadd float %e.load, 1.0
  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+  store atomic float %e.add, ptr addrspace(5) %e release, align 4
  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+  %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
  %f.add = fadd double %f.load, 1.
  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+  store atomic double %f.add, ptr addrspace(5) %e release, align 8
+
+  ; TODO: LLVM IR Verifier does not support atomics on vector types.
  ret void
}
-; CHECK-LABEL: local_monotonic_volatile
-define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_acq_rel_volatile
+define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; TODO: this codegen loses Concurrent Forward Progress
+
  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
  %a.add = add i8 %a.load, 1
  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
  %b.add = add i16 %b.load, 1
  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
  %c.add = add i32 %c.load, 1
  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
  %d.add = add i64 %d.load, 1
  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+  %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
  %e.add = fadd float %e.load, 1.0
  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+  store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+  %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
  %f.add = fadd double %f.load, 1.
  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+  store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
+
+  ; TODO: LLVM IR Verifier does not support atomics on vector types.
  ret void
}
-; CHECK-LABEL: local_acq_rel
-define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_seq_cst
+define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; TODO: this codegen loses Concurrent Forward Progress
+
  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
+  %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1
  %a.add = add i8 %a.load, 1
  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
+  store atomic i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
+  %b.load = load atomic i16, ptr addrspace(5) %b seq_cst, align 2
  %b.add = add i16 %b.load, 1
  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
+  store atomic i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
+  %c.load = load atomic i32, ptr addrspace(5) %c seq_cst, align 4
  %c.add = add i32 %c.load, 1
  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
+  store atomic i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
+  %d.load = load atomic i64, ptr addrspace(5) %d seq_cst, align 8
  %d.add = add i64 %d.load, 1
  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
+  store atomic i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
+  %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4
  %e.add = fadd float %e.load, 1.0
  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  store atomic float %e.add, ptr addrspace(5) %e release, align 4
+  store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4
  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
+  %f.load = load atomic double, ptr addrspace(5) %e seq_cst, align 8
  %f.add = fadd double %f.load, 1.
  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  store atomic double %f.add, ptr addrspace(5) %e release, align 8
+  store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8
+
+  ; TODO: LLVM IR Verifier does not support atomics on vector types.
  ret void
}
-; CHECK-LABEL: local_acq_rel_volatile
-define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_seq_cst_volatile
+define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; TODO: this codegen loses Concurrent Forward Progress
+
  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1
  %a.add = add i8 %a.load, 1
  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b seq_cst, align 2
  %b.add = add i16 %b.load, 1
  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c seq_cst, align 4
  %c.add = add i32 %c.load, 1
  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d seq_cst, align 8
  %d.add = add i64 %d.load, 1
  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
+  %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4
  %e.add = fadd float %e.load, 1.0
  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
+  store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4
  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
+  %f.load = load atomic volatile double, ptr addrspace(5) %e seq_cst, align 8
  %f.add = fadd double %f.load, 1.
  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
+  store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8
+
+  ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void } + +; TODO: missing .const statespace tests +; TODO: missing .param statespace tests diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index 4c5e0920ce1ae..dde5297d0feaa 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -1,5 +1,7 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70 +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} ; generic statespace @@ -36,10 +38,81 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { store float %e.add, ptr %c ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr %c + %f.load = load double, ptr %d %f.add = fadd double %f.load, 1. ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr %c + store double %f.add, ptr %d + + ; TODO: should be combined into single .u16 op + ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr %b + + ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr %c + + ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr %c + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr %d + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr %d + + ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors + ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
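+  ; Editorial sketch (not part of the original test; assumes the usual vector type
+  ; legalization): because PTX has no .v4.u64, a hypothetical access such as
+  ;   %w.load = load <4 x i64>, ptr %d
+  ; would be expected to split into two 128-bit halves (e.g. two ld.v2.u64)
+  ; rather than lower to a single vector instruction.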
+ + ; TODO: should be combined into single .u64 op + ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr %d + + ; TODO: should be combined into single .b128 op + ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr %d ret void } @@ -82,47 +155,126 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr %c + ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr %b + + ; TODO: should NOT be combined into a single .u32 op + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr %c + + ; TODO: should NOT be combined into a single .u32 op + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> %j.add, ptr %c + + ; CHECK: ld.volatile.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.volatile.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr %d + + ; CHECK: ld.volatile.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.volatile.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr %d + + ; CHECK: ld.volatile.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.volatile.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr %d + + ; CHECK: ld.volatile.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr %d + + ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors + ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
+ + ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.volatile.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr %d + + ; CHECK: ld.volatile.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.volatile.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr %d + + ; CHECK: ld.volatile.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.volatile.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr %d + ret void } ; CHECK-LABEL: generic_monotonic define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a monotonic, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b monotonic, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c monotonic, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d monotonic, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e monotonic, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, 
[%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } @@ -164,47 +316,63 @@ define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } ; CHECK-LABEL: generic_unordered define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a unordered, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b unordered, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c unordered, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d unordered, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e unordered, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e unordered, align 8 %f.add = fadd double 
%f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } @@ -246,6 +414,8 @@ define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } @@ -289,6 +459,77 @@ define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspac ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(1) %c + ; TODO: should be combined into single .u16 op + ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(1) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(1) %b + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(1) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(1) %c + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(1) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(1) %c + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(1) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(1) %d + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(1) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(1) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(1) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(1) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(1) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(1) %d + + ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors + ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
+ + ; TODO: should be combined into single .u64 op + ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(1) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(1) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr addrspace(1) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(1) %d + + ; TODO: should be combined into single .b128 op + ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(1) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(1) %d + ret void } @@ -330,129 +571,236 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(1) %c + ; CHECK: ld.volatile.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(1) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile<2 x i8> %h.add, ptr addrspace(1) %b + + ; TODO: should NOT be combined into single .u32 op + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(1) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile<4 x i8> %i.add, ptr addrspace(1) %c + + ; TODO: should NOT be combined into single .u32 op + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(1) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile<2 x i16> %j.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(1) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.volatile.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile<4 x i16> %k.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(1) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.volatile.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile<2 x i32> %l.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(1) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.volatile.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile<4 x i32> %m.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr addrspace(1) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.volatile.global.v2.u64 
[%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile<2 x i64> %n.add, ptr addrspace(1) %d + + ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors + ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. + + ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(1) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.volatile.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile<2 x float> %o.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(1) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.volatile.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile<4 x float> %p.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(1) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.volatile.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile<2 x double> %q.add, ptr addrspace(1) %d + ret void } ; CHECK-LABEL: global_monotonic define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: 
st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } ; CHECK-LABEL: global_monotonic_volatile define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; 
CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } ; CHECK-LABEL: global_unordered define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = 
load atomic i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } @@ -494,6 +842,8 @@ define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
+ ret void } @@ -537,6 +887,77 @@ define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspac ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(3) %c + ; TODO: should be combined into single .u16 op + ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(3) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(3) %b + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(3) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(3) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(3) %c + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(3) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(3) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(3) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(3) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(3) %d + + ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors + ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
+ + ; TODO: should be combined into single .u64 op + ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(3) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr addrspace(3) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .b128 op + ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(3) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(3) %d + ret void } @@ -578,47 +999,134 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(3) %c + ; TODO: should be combined into single .u16 op + ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(3) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr addrspace(3) %b + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(3) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(3) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> %j.add, ptr addrspace(3) %c + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(3) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(3) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(3) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, 
%rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr addrspace(3) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr addrspace(3) %d + + ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors + ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(3) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(3) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr addrspace(3) %d + + ; TODO: should be combined into single .b128 op + ; CHECK: ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(3) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.volatile.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr addrspace(3) %d + ret void } ; CHECK-LABEL: shared_monotonic define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; TODO: in some cases it may be valid to optimize .sys.shared to .cta.shared or .cluster.shared. 
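; Illustration of the scope-narrowing TODO above (a sketch, not part of this
; patch; register names and the narrower-scope form are hypothetical):
;   %v = load atomic i32, ptr addrspace(3) %p monotonic, align 4
;   currently emitted on sm_70+:        ld.relaxed.sys.shared.u32 %r0, [%rd0];
;   narrower scope that may be valid
;   in some cases per the TODO:         ld.relaxed.cta.shared.u32 %r0, [%rd0];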
+ + ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
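; For reference on the recurring vector TODO: the LLVM IR verifier only
; accepts atomic loads and stores of integer, pointer, or floating-point
; type, so an input like the following (hypothetical) is rejected before it
; ever reaches the backend, which is why no vector atomic tests exist yet:
;   %v = load atomic <2 x i32>, ptr addrspace(3) %p monotonic, align 8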
+ ret void } @@ -660,47 +1168,65 @@ define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } ; CHECK-LABEL: shared_unordered define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; TODO: in some cases it may be valid to optimize .sys.shared to .cta.shared or .cluster.shared. + + ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, 
[%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } @@ -742,6 +1268,8 @@ define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } @@ -785,11 +1313,84 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(5) %c + ; TODO: should be combined into single .u16 op + ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(5) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(5) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(5) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(5) %c + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(5) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(5) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(5) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(5) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(5) %d + + ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors + ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
+ + ; TODO: should be combined into single .u64 op + ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(5) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr addrspace(5) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .b128 op + ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(5) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(5) %d + ret void } ; CHECK-LABEL: local_volatile define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; TODO: this codegen looses Concurrent Forward Progress + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load volatile i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -826,11 +1427,84 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(5) %c + ; TODO: should be combined into single .u16 op + ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(5) %b + %h.add = add <2 x i8> %h.load, + ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(5) %c + %i.add = add <4 x i8> %i.load, + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(5) %c + %j.add = add <2 x i16> %j.load, + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> %j.add, ptr addrspace(5) %c + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(5) %d + %k.add = add <4 x i16> %k.load, + ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(5) %d + %l.add = add <2 x i32> %l.load, + ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(5) %d + %m.add = add <4 x i32> %m.load, + ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, 
%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr addrspace(5) %d + %n.add = add <2 x i64> %n.load, + ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr addrspace(5) %d + + ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors + ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. + + ; TODO: should be combined into single .u64 op + ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(5) %d + %o.add = fadd <2 x float> %o.load, + ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .b128 op in sm_70+ + ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(5) %d + %p.add = fadd <4 x float> %p.load, + ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr addrspace(5) %d + + ; TODO: should be combined into single .b128 op + ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(5) %d + %q.add = fadd <2 x double> %q.load, + ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr addrspace(5) %d + ret void } ; CHECK-LABEL: local_monotonic define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: this codegen looses Concurrent Forward Progress + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -867,11 +1541,15 @@ define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrs ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } ; CHECK-LABEL: local_monotonic_volatile define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: this codegen looses Concurrent Forward Progress + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -908,6 +1586,8 @@ define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } @@ -949,6 +1629,8 @@ define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrs ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. 
+ ret void } @@ -990,5 +1672,10 @@ define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 + ; TODO: LLVM IR Verifier does not support atomics on vector types. + ret void } + +; TODO: missing .const statespace tests +; TODO: missing .param statespace tests From 6b6ef4849d7ea49b92df9b0b5ceb2261ee3e8716 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 16 Jul 2024 07:03:27 -0700 Subject: [PATCH 07/22] [NVPTX] Cleanups - Remove dead code in InstPrinter - Capitalization and improve Table comments - Update fence->membar lowering --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 12 ------------ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 18 ++++++++++-------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 45561d5a11238..3a692feb47ed6 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -256,18 +256,6 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, report_fatal_error(OS.str()); break; } - } else if (!strcmp(Modifier, "sc")) { - auto Ordering = - NVPTX::Ordering(static_cast(Imm)); - switch (Ordering) { - // TODO: refactor fence insertion in ISelDagToDag instead of here - // as part of implementing atomicrmw seq_cst. - case NVPTX::Ordering::SequentiallyConsistent: - O << "fence.sc.sys;\n\t"; - break; - default: - break; - } } else if (!strcmp(Modifier, "addsp")) { switch (Imm) { case NVPTX::PTXLdStInstCode::GLOBAL: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0addd3dc9aaf0..b1077b3ccacdc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -717,10 +717,10 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { struct OperationOrderings { NVPTX::OrderingUnderlyingType InstrOrdering; NVPTX::OrderingUnderlyingType FenceOrdering; - OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic, - NVPTX::Ordering f = NVPTX::Ordering::NotAtomic) - : InstrOrdering(static_cast(o)), - FenceOrdering(static_cast(f)) {} + OperationOrderings(NVPTX::Ordering O = NVPTX::Ordering::NotAtomic, + NVPTX::Ordering F = NVPTX::Ordering::NotAtomic) + : InstrOrdering(static_cast(O)), + FenceOrdering(static_cast(F)) {} }; static OperationOrderings @@ -734,6 +734,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { // clang-format off // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error). + // Note: uses of Relaxed in the Atomic column of this table refer + // to LLVM AtomicOrdering::Monotonic. // // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ | // |---------|----------|--------------------|------------|------------------------------| @@ -1155,7 +1157,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { case NVPTX::Ordering::SequentiallyConsistent: { unsigned Op = Subtarget->hasMemoryOrdering() ? 
NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::atomic_thread_fence_seq_cst_sys_membar; + : NVPTX::INT_MEMBAR_SYS; Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0); break; } @@ -1318,7 +1320,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { case NVPTX::Ordering::SequentiallyConsistent: { unsigned Op = Subtarget->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::atomic_thread_fence_seq_cst_sys_membar; + : NVPTX::INT_MEMBAR_SYS; Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); break; } @@ -1990,7 +1992,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { case NVPTX::Ordering::SequentiallyConsistent: { unsigned Op = Subtarget->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::atomic_thread_fence_seq_cst_sys_membar; + : NVPTX::INT_MEMBAR_SYS; Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0); break; } @@ -2150,7 +2152,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { case NVPTX::Ordering::SequentiallyConsistent: { unsigned Op = Subtarget->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::atomic_thread_fence_seq_cst_sys_membar; + : NVPTX::INT_MEMBAR_SYS; Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); break; } From 41fb361e978826d571697922716768bc88909b11 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 25 Jul 2024 11:58:00 -0700 Subject: [PATCH 08/22] [NVPTX] Update atomic volatile unordered test --- llvm/test/CodeGen/NVPTX/load-store.ll | 36 ++++++++++++++++++--------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index dde5297d0feaa..853b08ee99f0f 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -806,40 +806,52 @@ define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addr ; CHECK-LABEL: global_unordered_volatile define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c 
unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 ; TODO: LLVM IR Verifier does not support atomics on vector types. 
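The SM60/SM70 check prefixes used throughout these load-store tests assume a RUN header along the following lines; the exact cpu and PTX feature flags shown here are illustrative placeholders, not taken from the patch:

; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s -check-prefixes=CHECK,SM60
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s -check-prefixes=CHECK,SM70

Checks shared by both targets keep the plain CHECK prefix; only the lines that differ between the sm_60 volatile lowering and the sm_70+ relaxed/mmio lowering are split into SM60 and SM70 variants.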
From 4cc2825e049beaf03b3109daf105b20e973fd422 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 25 Jul 2024 12:05:01 -0700 Subject: [PATCH 09/22] [NVPTX] Update comment: now handling Unordered --- llvm/lib/Target/NVPTX/NVPTX.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index c9cce23788ca4..1615c50d73384 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -110,7 +110,7 @@ enum LoadStore { using OrderingUnderlyingType = unsigned int; enum class Ordering : OrderingUnderlyingType { NotAtomic = 0, // PTX calls these: "Weak" - // Unordered = 1, // TODO: NVPTX should map this to "Relaxed" + // Unordered = 1, // NVPTX maps LLVM Unorderd to Relaxed Relaxed = 2, // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire" Acquire = 4, From 932a90288dfcc86ae66dc59877aa131137fdb5ff Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 30 Jul 2024 06:05:44 -0700 Subject: [PATCH 10/22] [NVPTX] refactor NVPTX::Ordering to string/stream --- llvm/lib/Target/NVPTX/NVPTX.h | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 1615c50d73384..461e18c790703 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -122,36 +122,32 @@ enum class Ordering : OrderingUnderlyingType { LAST = RelaxedMMIO }; -template OStream &operator<<(OStream &O, Ordering Order) { +inline char const *toCString(Ordering Order) { switch (Order) { case Ordering::NotAtomic: - O << "NotAtomic"; - return O; + return "NotAtomic"; case Ordering::Relaxed: - O << "Relaxed"; - return O; + return "Relaxed"; case Ordering::Acquire: - O << "Acquire"; - return O; + return "Acquire"; case Ordering::Release: - O << "Release"; - return O; - // case Ordering::AcquireRelease: - // O << "AcquireRelease"; - // return O; + return "Release"; + // case Ordering::AcquireRelease: return "AcquireRelease"; case Ordering::SequentiallyConsistent: - O << "SequentiallyConsistent"; - return O; + return "SequentiallyConsistent"; case Ordering::Volatile: - O << "Volatile"; - return O; + return "Volatile"; case Ordering::RelaxedMMIO: - O << "RelaxedMMIO"; - return O; + return "RelaxedMMIO"; } report_fatal_error("unknown ordering"); } +inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) { + O << toCString(Order); + return O; +} + namespace PTXLdStInstCode { enum AddressSpace { GENERIC = 0, From 5dc3e0034db658a899e8d5cfabc17bd1a6b0ce3f Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 30 Jul 2024 06:14:30 -0700 Subject: [PATCH 11/22] [NVPTX] Refactor OperationOrderings into anonymous namespace to avoid collisions --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index b1077b3ccacdc..0a408e960521b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -714,6 +714,8 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } +namespace { + struct OperationOrderings { NVPTX::OrderingUnderlyingType InstrOrdering; NVPTX::OrderingUnderlyingType FenceOrdering; @@ -907,11 +909,11 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { // This sets the ordering of the fence to 
SequentiallyConsistent, and // sets the corresponding ordering for the instruction. NVPTX::Ordering InstrOrder; - if (N->readMem()) { + if (N->readMem()) InstrOrder = NVPTX::Ordering::Acquire; - } else if (N->writeMem()) { + else if (N->writeMem()) InstrOrder = NVPTX::Ordering::Release; - } else { + else { SmallString<256> Msg; raw_svector_ostream OS(Msg); OS << "NVPTX does not support SequentiallyConsistent Ordering on " @@ -934,6 +936,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { report_fatal_error(OS.str()); } +} // namespace + static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address From e36d7c48b5f03d20b1b8ffcc5e22f61019214945 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 30 Jul 2024 06:48:11 -0700 Subject: [PATCH 12/22] [NVPTX] Cleanup error reporting --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 10 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 99 +++++++------------ 2 files changed, 39 insertions(+), 70 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 3a692feb47ed6..5be56e7e6a04c 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/FormattedStream.h" #include using namespace llvm; @@ -249,12 +250,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, O << ".mmio.relaxed.sys"; break; default: - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "NVPTX LdStCode Printer does not support \"" << Ordering - << "\" sem modifier."; - report_fatal_error(OS.str()); - break; + report_fatal_error(formatv( + "NVPTX LdStCode Printer does not support \"{}\" sem modifier.", + toCString(Ordering))); } } else if (!strcmp(Modifier, "addsp")) { switch (Imm) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0a408e960521b..2370622f19362 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -828,12 +829,11 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { Ordering == AtomicOrdering::Unordered || Ordering == AtomicOrdering::Monotonic) && !HasMemoryOrdering) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX does not support \"atomic\" for orderings different than" - "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \"" - << toIRString(Ordering) << "\"."; - report_fatal_error(OS.str()); + report_fatal_error( + formatv("PTX does not support \"atomic\" for orderings different than" + "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order " + "is: \"{}\".", + toIRString(Ordering))); } // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop @@ -870,35 +870,25 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { // case AtomicOrdering::Consume: // If LLVM ever 
provides this, lower it to // Acquire. case AtomicOrdering::Acquire: - if (!N->readMem()) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX only supports Acquire Ordering on reads: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); - } + if (!N->readMem()) + report_fatal_error( + formatv("PTX only supports Acquire Ordering on reads: {}", + N->getOperationName())); return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Acquire : NVPTX::Ordering::NotAtomic; case AtomicOrdering::Release: - if (!N->writeMem()) { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "PTX only supports Release Ordering on writes: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); - } + if (!N->writeMem()) + report_fatal_error( + formatv("PTX only supports Release Ordering on writes: {}", + N->getOperationName())); return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Release : NVPTX::Ordering::NotAtomic; case AtomicOrdering::AcquireRelease: { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "NVPTX does not support AcquireRelease Ordering on read-modify-write " - "yet and PTX does not support it on loads or stores: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); + report_fatal_error( + formatv("NVPTX does not support AcquireRelease Ordering on " + "read-modify-write " + "yet and PTX does not support it on loads or stores: {}", + N->getOperationName())); } case AtomicOrdering::SequentiallyConsistent: { // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX @@ -913,27 +903,20 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { InstrOrder = NVPTX::Ordering::Acquire; else if (N->writeMem()) InstrOrder = NVPTX::Ordering::Release; - else { - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "NVPTX does not support SequentiallyConsistent Ordering on " - "read-modify-writes yet: " - << N->getOperationName(); - N->print(OS); - report_fatal_error(OS.str()); - } + else + report_fatal_error( + formatv("NVPTX does not support SequentiallyConsistent Ordering on " + "read-modify-writes yet: {}", + N->getOperationName())); return AddrGenericOrGlobalOrShared ? 
OperationOrderings(InstrOrder, NVPTX::Ordering::SequentiallyConsistent) : OperationOrderings(NVPTX::Ordering::NotAtomic); } } - - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "NVPTX backend does not support AtomicOrdering \"" - << toIRString(Ordering) << "\" yet."; - report_fatal_error(OS.str()); + report_fatal_error( + formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.", + toIRString(Ordering))); } } // namespace @@ -1166,11 +1149,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { break; } default: - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering) - << "\"."; - report_fatal_error(OS.str()); + report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", + toCString(NVPTX::Ordering(FenceOrdering)))); } // Type Setting: fromType + fromTypeWidth @@ -1329,11 +1309,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { break; } default: - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering) - << "\"."; - report_fatal_error(OS.str()); + report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", + toCString(NVPTX::Ordering(FenceOrdering)))); } // Vector Setting @@ -2001,11 +1978,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { break; } default: - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering) - << "\"."; - report_fatal_error(OS.str()); + report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", + toCString(NVPTX::Ordering(FenceOrdering)))); } // Vector Setting @@ -2161,11 +2135,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } default: - SmallString<256> Msg; - raw_svector_ostream OS(Msg); - OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering) - << "\"."; - report_fatal_error(OS.str()); + report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", + toCString(NVPTX::Ordering(FenceOrdering)))); } // Type Setting: toType + toTypeWidth From c9a5dd8818ac9601beb66a29de305ad752ab40b0 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 30 Jul 2024 10:05:44 -0700 Subject: [PATCH 13/22] [NVPTX] Cleanup comments in tests --- llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 46 ++----- llvm/test/CodeGen/NVPTX/load-store.ll | 126 ++++++-------------- 2 files changed, 45 insertions(+), 127 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 4d3b11094d3a1..9cea33d12027f 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -41,8 +41,6 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e release, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -84,8 +82,6 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e release, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. 
- ret void } @@ -139,8 +135,6 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -194,8 +188,6 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -239,8 +231,6 @@ define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e release, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -282,8 +272,6 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -337,8 +325,6 @@ define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -392,8 +378,6 @@ define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -437,8 +421,6 @@ define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e release, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -480,8 +462,6 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -535,8 +515,6 @@ define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -590,8 +568,6 @@ define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. 
- ret void } @@ -599,7 +575,8 @@ define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ; CHECK-LABEL: local_acq_rel define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: this codegen looses Concurrent Forward Progress + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 @@ -637,14 +614,13 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e release, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } ; CHECK-LABEL: local_acq_rel_volatile define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: this codegen looses Concurrent Forward Progress + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 @@ -682,14 +658,13 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } ; CHECK-LABEL: local_seq_cst define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: this codegen looses Concurrent Forward Progress + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1 @@ -727,14 +702,13 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } ; CHECK-LABEL: local_seq_cst_volatile define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: this codegen looses Concurrent Forward Progress + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. 
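; Concretely, what the TODO above refers to (a sketch with hypothetical
; %v/%p names):
;   %v = load atomic i32, ptr addrspace(5) %p acquire, align 4
; currently lowers to a plain weak load, dropping the ordering:
;   ld.local.u32 %r0, [%rd0];
; the TODO is to emit a PTX atomic form here instead, so that the
; Concurrent Forward Progress guarantees PTX attaches to atomic
; operations are preserved.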
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1 @@ -777,5 +751,5 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ret void } -; TODO: missing .const statespace tests -; TODO: missing .param statespace tests +; TODO: add plain,atomic,volatile,atomic volatile tests +; for .const and .param statespaces \ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index 853b08ee99f0f..81cfb1b6fd493 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -43,13 +43,18 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr %d - ; TODO: should be combined into single .u16 op + ; TODO: make the lowering of this weak vector ops consistent with + ; the ones of the next tests. This test lowers to a weak PTX + ; vector op, but next test lowers to a vector PTX op. ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %h.load = load <2 x i8>, ptr %b %h.add = add <2 x i8> %h.load, ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} store <2 x i8> %h.add, ptr %b + ; TODO: make the lowering of this weak vector ops consistent with + ; the ones of the previous test. This test lowers to a weak + ; PTX scalar op, but prior test lowers to a vector PTX op. ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %i.load = load <4 x i8>, ptr %c %i.add = add <4 x i8> %i.load, @@ -62,28 +67,24 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store <2 x i16> %j.add, ptr %c - ; TODO: should be combined into single .u64 op ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %k.load = load <4 x i16>, ptr %d %k.add = add <4 x i16> %k.load, ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} store <4 x i16> %k.add, ptr %d - ; TODO: should be combined into single .u64 op ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %l.load = load <2 x i32>, ptr %d %l.add = add <2 x i32> %l.load, ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} store <2 x i32> %l.add, ptr %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %m.load = load <4 x i32>, ptr %d %m.add = add <4 x i32> %m.load, ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} store <4 x i32> %m.add, ptr %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] %n.load = load <2 x i64>, ptr %d %n.add = add <2 x i64> %n.load, @@ -93,21 +94,18 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
- ; TODO: should be combined into single .u64 op ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load <2 x float>, ptr %d %o.add = fadd <2 x float> %o.load, ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} store <2 x float> %o.add, ptr %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %p.load = load <4 x float>, ptr %d %p.add = fadd <4 x float> %p.load, ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} store <4 x float> %p.add, ptr %d - ; TODO: should be combined into single .b128 op ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] %q.load = load <2 x double>, ptr %d %q.add = fadd <2 x double> %q.load, @@ -155,20 +153,37 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr %c + ; TODO: volatile, atomic, and volatile atomic memory operations on vector types. + ; Currently, LLVM: + ; - does not allow atomic operations on vectors. + ; - it allows volatile operations but not clear what that means. + ; Following both semantics make sense in general and PTX supports both: + ; - volatile/atomic/volatile atomic applies to the whole vector + ; - volatile/atomic/volatile atomic applies elementwise + ; Actions required: + ; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those + ; Below tests show that the current implementation picks the semantics in an inconsistent way + ; * volatile <2 x i8> lowers to "elementwise volatile" + ; * <4 x i8> lowers to "full vector volatile" + ; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics + ; - update tests in load-store-sm70.ll as well. + + ; TODO: make this operation consistent with the one for <4 x i8> + ; This operation lowers to a "element wise volatile PTX operation". ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %h.load = load volatile <2 x i8>, ptr %b %h.add = add <2 x i8> %h.load, ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} store volatile <2 x i8> %h.add, ptr %b - ; TODO: should NOT be combined into a single .u32 op + ; TODO: make this operation consistent with the one for <2 x i8> + ; This operation lowers to a "full vector volatile PTX operation". ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %i.load = load volatile <4 x i8>, ptr %c %i.add = add <4 x i8> %i.load, ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store volatile <4 x i8> %i.add, ptr %c - ; TODO: should NOT be combined into a single .u32 op ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %j.load = load volatile <2 x i16>, ptr %c %j.add = add <2 x i16> %j.load, @@ -273,8 +288,6 @@ define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unn ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e monotonic, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -316,8 +329,6 @@ define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e monotonic, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. 
- ret void } @@ -371,8 +382,6 @@ define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unn ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e unordered, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -414,8 +423,6 @@ define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e unordered, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -459,7 +466,6 @@ define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspac ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(1) %c - ; TODO: should be combined into single .u16 op ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %h.load = load <2 x i8>, ptr addrspace(1) %b %h.add = add <2 x i8> %h.load, @@ -478,28 +484,24 @@ define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspac ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store <2 x i16> %j.add, ptr addrspace(1) %c - ; TODO: should be combined into single .u64 op ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %k.load = load <4 x i16>, ptr addrspace(1) %d %k.add = add <4 x i16> %k.load, ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} store <4 x i16> %k.add, ptr addrspace(1) %d - ; TODO: should be combined into single .u64 op ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %l.load = load <2 x i32>, ptr addrspace(1) %d %l.add = add <2 x i32> %l.load, ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} store <2 x i32> %l.add, ptr addrspace(1) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %m.load = load <4 x i32>, ptr addrspace(1) %d %m.add = add <4 x i32> %m.load, ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} store <4 x i32> %m.add, ptr addrspace(1) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] %n.load = load <2 x i64>, ptr addrspace(1) %d %n.add = add <2 x i64> %n.load, @@ -509,21 +511,18 @@ define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspac ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
- ; TODO: should be combined into single .u64 op ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load <2 x float>, ptr addrspace(1) %d %o.add = fadd <2 x float> %o.load, ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} store <2 x float> %o.add, ptr addrspace(1) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %p.load = load <4 x float>, ptr addrspace(1) %d %p.add = fadd <4 x float> %p.load, ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} store <4 x float> %p.add, ptr addrspace(1) %d - ; TODO: should be combined into single .b128 op ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] %q.load = load <2 x double>, ptr addrspace(1) %d %q.add = fadd <2 x double> %q.load, @@ -577,14 +576,12 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs ; CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} store volatile<2 x i8> %h.add, ptr addrspace(1) %b - ; TODO: should NOT be combined into single .u32 op ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %i.load = load volatile <4 x i8>, ptr addrspace(1) %c %i.add = add <4 x i8> %i.load, ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store volatile<4 x i8> %i.add, ptr addrspace(1) %c - ; TODO: should NOT be combined into single .u32 op ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %j.load = load volatile <2 x i16>, ptr addrspace(1) %c %j.add = add <2 x i16> %j.load, @@ -689,8 +686,6 @@ define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addr ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -744,8 +739,6 @@ define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -799,8 +792,6 @@ define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addr ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -854,8 +845,6 @@ define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. 
- ret void } @@ -899,7 +888,6 @@ define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspac ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(3) %c - ; TODO: should be combined into single .u16 op ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %h.load = load <2 x i8>, ptr addrspace(3) %b %h.add = add <2 x i8> %h.load, @@ -918,28 +906,24 @@ define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspac ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store <2 x i16> %j.add, ptr addrspace(3) %c - ; TODO: should be combined into single .u64 op ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %k.load = load <4 x i16>, ptr addrspace(3) %d %k.add = add <4 x i16> %k.load, ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} store <4 x i16> %k.add, ptr addrspace(3) %d - ; TODO: should be combined into single .u64 op ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %l.load = load <2 x i32>, ptr addrspace(3) %d %l.add = add <2 x i32> %l.load, ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} store <2 x i32> %l.add, ptr addrspace(3) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %m.load = load <4 x i32>, ptr addrspace(3) %d %m.add = add <4 x i32> %m.load, ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} store <4 x i32> %m.add, ptr addrspace(3) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] %n.load = load <2 x i64>, ptr addrspace(3) %d %n.add = add <2 x i64> %n.load, @@ -949,21 +933,18 @@ define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspac ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
- ; TODO: should be combined into single .u64 op ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load <2 x float>, ptr addrspace(3) %d %o.add = fadd <2 x float> %o.load, ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} store <2 x float> %o.add, ptr addrspace(3) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %p.load = load <4 x float>, ptr addrspace(3) %d %p.add = fadd <4 x float> %p.load, ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} store <4 x float> %p.add, ptr addrspace(3) %d - ; TODO: should be combined into single .b128 op ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] %q.load = load <2 x double>, ptr addrspace(3) %d %q.add = fadd <2 x double> %q.load, @@ -1011,7 +992,6 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(3) %c - ; TODO: should be combined into single .u16 op ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %h.load = load volatile <2 x i8>, ptr addrspace(3) %b %h.add = add <2 x i8> %h.load, @@ -1030,28 +1010,24 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store volatile <2 x i16> %j.add, ptr addrspace(3) %c - ; TODO: should be combined into single .u64 op ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %k.load = load volatile <4 x i16>, ptr addrspace(3) %d %k.add = add <4 x i16> %k.load, ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} store volatile <4 x i16> %k.add, ptr addrspace(3) %d - ; TODO: should be combined into single .u64 op ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %l.load = load volatile <2 x i32>, ptr addrspace(3) %d %l.add = add <2 x i32> %l.load, ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} store volatile <2 x i32> %l.add, ptr addrspace(3) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %m.load = load volatile <4 x i32>, ptr addrspace(3) %d %m.add = add <4 x i32> %m.load, ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} store volatile <4 x i32> %m.add, ptr addrspace(3) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] %n.load = load volatile <2 x i64>, ptr addrspace(3) %d %n.add = add <2 x i64> %n.load, @@ -1061,21 +1037,18 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
- ; TODO: should be combined into single .u64 op ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load volatile <2 x float>, ptr addrspace(3) %d %o.add = fadd <2 x float> %o.load, ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} store volatile <2 x float> %o.add, ptr addrspace(3) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %p.load = load volatile <4 x float>, ptr addrspace(3) %d %p.add = fadd <4 x float> %p.load, ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} store volatile <4 x float> %p.add, ptr addrspace(3) %d - ; TODO: should be combined into single .b128 op ; CHECK: ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] %q.load = load volatile <2 x double>, ptr addrspace(3) %d %q.add = fadd <2 x double> %q.load, @@ -1087,7 +1060,7 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ; CHECK-LABEL: shared_monotonic define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; TODO: in some cases it may be valid to optimize .sys.shared to .cta.shared or .cluster.shared. + ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] @@ -1137,8 +1110,6 @@ define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addr ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -1180,14 +1151,12 @@ define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } ; CHECK-LABEL: shared_unordered define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; TODO: in some cases it may be valid to optimize .sys.shared to .cta.shared or .cluster.shared. + ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] @@ -1237,8 +1206,6 @@ define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addr ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -1280,8 +1247,6 @@ define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. 
- ret void } @@ -1325,7 +1290,6 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(5) %c - ; TODO: should be combined into single .u16 op ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %h.load = load <2 x i8>, ptr addrspace(5) %b %h.add = add <2 x i8> %h.load, @@ -1344,28 +1308,24 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store <2 x i16> %j.add, ptr addrspace(5) %c - ; TODO: should be combined into single .u64 op ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %k.load = load <4 x i16>, ptr addrspace(5) %d %k.add = add <4 x i16> %k.load, ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} store <4 x i16> %k.add, ptr addrspace(5) %d - ; TODO: should be combined into single .u64 op ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %l.load = load <2 x i32>, ptr addrspace(5) %d %l.add = add <2 x i32> %l.load, ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} store <2 x i32> %l.add, ptr addrspace(5) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %m.load = load <4 x i32>, ptr addrspace(5) %d %m.add = add <4 x i32> %m.load, ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} store <4 x i32> %m.add, ptr addrspace(5) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] %n.load = load <2 x i64>, ptr addrspace(5) %d %n.add = add <2 x i64> %n.load, @@ -1375,21 +1335,18 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. - ; TODO: should be combined into single .u64 op ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load <2 x float>, ptr addrspace(5) %d %o.add = fadd <2 x float> %o.load, ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} store <2 x float> %o.add, ptr addrspace(5) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %p.load = load <4 x float>, ptr addrspace(5) %d %p.add = fadd <4 x float> %p.load, ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} store <4 x float> %p.add, ptr addrspace(5) %d - ; TODO: should be combined into single .b128 op ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] %q.load = load <2 x double>, ptr addrspace(5) %d %q.add = fadd <2 x double> %q.load, @@ -1401,7 +1358,8 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK-LABEL: local_volatile define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; TODO: this codegen looses Concurrent Forward Progress + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using volatile operations. 
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load volatile i8, ptr addrspace(5) %a @@ -1439,7 +1397,6 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(5) %c - ; TODO: should be combined into single .u16 op ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %h.load = load volatile <2 x i8>, ptr addrspace(5) %b %h.add = add <2 x i8> %h.load, @@ -1458,28 +1415,24 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store volatile <2 x i16> %j.add, ptr addrspace(5) %c - ; TODO: should be combined into single .u64 op ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] %k.load = load volatile <4 x i16>, ptr addrspace(5) %d %k.add = add <4 x i16> %k.load, ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} store volatile <4 x i16> %k.add, ptr addrspace(5) %d - ; TODO: should be combined into single .u64 op ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %l.load = load volatile <2 x i32>, ptr addrspace(5) %d %l.add = add <2 x i32> %l.load, ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} store volatile <2 x i32> %l.add, ptr addrspace(5) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] %m.load = load volatile <4 x i32>, ptr addrspace(5) %d %m.add = add <4 x i32> %m.load, ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} store volatile <4 x i32> %m.add, ptr addrspace(5) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] %n.load = load volatile <2 x i64>, ptr addrspace(5) %d %n.add = add <2 x i64> %n.load, @@ -1489,21 +1442,18 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
- ; TODO: should be combined into single .u64 op ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load volatile <2 x float>, ptr addrspace(5) %d %o.add = fadd <2 x float> %o.load, ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} store volatile <2 x float> %o.add, ptr addrspace(5) %d - ; TODO: should be combined into single .b128 op in sm_70+ ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %p.load = load volatile <4 x float>, ptr addrspace(5) %d %p.add = fadd <4 x float> %p.load, ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} store volatile <4 x float> %p.add, ptr addrspace(5) %d - ; TODO: should be combined into single .b128 op ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] %q.load = load volatile <2 x double>, ptr addrspace(5) %d %q.add = fadd <2 x double> %q.load, @@ -1515,7 +1465,8 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ; CHECK-LABEL: local_monotonic define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: this codegen looses Concurrent Forward Progress + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 @@ -1553,14 +1504,13 @@ define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrs ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } ; CHECK-LABEL: local_monotonic_volatile define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: this codegen looses Concurrent Forward Progress + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by generating atomic or volatile operations ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 @@ -1598,8 +1548,6 @@ define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -1641,8 +1589,6 @@ define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrs ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. - ret void } @@ -1684,10 +1630,8 @@ define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. 
- ret void } -; TODO: missing .const statespace tests -; TODO: missing .param statespace tests +; TODO: add plain,atomic,volatile,atomic volatile tests +; for .const and .param statespaces \ No newline at end of file From e727c76fe246d186eafec2613d2058c118a55d37 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 11 Jul 2024 21:27:14 +0200 Subject: [PATCH 14/22] [NVPTX] Refactor and clean up load,tryLoad,tryStore a bit --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 270 ++++++++------------ 1 file changed, 103 insertions(+), 167 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 2370622f19362..8bff731402563 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1109,17 +1109,15 @@ static int getLdStRegType(EVT VT) { } bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { - SDLoc dl(N); MemSDNode *LD = cast(N); assert(LD->readMem() && "Expected load"); - LoadSDNode *PlainLoad = dyn_cast(N); - EVT LoadedVT = LD->getMemoryVT(); - SDNode *NVPTXLD = nullptr; // do not support pre/post inc/dec + LoadSDNode *PlainLoad = dyn_cast(N); if (PlainLoad && PlainLoad->isIndexed()) return false; + EVT LoadedVT = LD->getMemoryVT(); if (!LoadedVT.isSimple()) return false; @@ -1137,6 +1135,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); // If a fence is required before the operation, insert it: + SDLoc DL(N); SDValue Chain = N->getOperand(0); switch (NVPTX::Ordering(FenceOrdering)) { case NVPTX::Ordering::NotAtomic: @@ -1145,7 +1144,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { unsigned Op = Subtarget->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys : NVPTX::INT_MEMBAR_SYS; - Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0); + Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); break; } default: @@ -1162,22 +1161,22 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { MVT SimpleVT = LoadedVT.getSimpleVT(); MVT ScalarVT = SimpleVT.getScalarType(); // Read at least 8 bits (predicates are stored as 8-bit values) - unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); - unsigned int fromType; + unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); + unsigned int FromType; // Vector Setting - unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + unsigned VecType = NVPTX::PTXLdStInstCode::Scalar; if (SimpleVT.isVector()) { assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) && "Unexpected vector type"); // v2f16/v2bf16/v2i16 is loaded using ld.b32 - fromTypeWidth = 32; + FromTypeWidth = 32; } if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) - fromType = NVPTX::PTXLdStInstCode::Signed; + FromType = NVPTX::PTXLdStInstCode::Signed; else - fromType = getLdStRegType(ScalarVT); + FromType = getLdStRegType(ScalarVT); // Create the machine instruction DAG SDValue N1 = N->getOperand(1); @@ -1186,20 +1185,19 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { std::optional Opcode; MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; + SmallVector Ops({getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL)}); + if (SelectDirectAddr(N1, Addr)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar, NVPTX::LD_i64_avar, NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) 
return false; - SDValue Ops[] = {getI32Imm(InstructionOrdering, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Addr, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.push_back(Addr); + Ops.push_back(Chain); } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, @@ -1207,15 +1205,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(InstructionOrdering, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.push_back(Base); + Ops.push_back(Offset); + Ops.push_back(Chain); } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset) : SelectADDRri(N1.getNode(), N1, Base, Offset)) { if (PointerSize == 64) @@ -1229,15 +1221,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(InstructionOrdering, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.push_back(Base); + Ops.push_back(Offset); + Ops.push_back(Chain); } else { if (PointerSize == 64) Opcode = @@ -1250,16 +1236,12 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(InstructionOrdering, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(fromType, dl), - getI32Imm(fromTypeWidth, dl), - N1, - Chain}; - NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); + Ops.push_back(N1); + Ops.push_back(Chain); } + SDNode *NVPTXLD = + CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops); if (!NVPTXLD) return false; @@ -1271,16 +1253,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { } bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { - - SDValue Chain = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue Addr, Offset, Base; - std::optional Opcode; - SDLoc DL(N); - SDNode *LD; MemSDNode *MemSD = cast(N); EVT LoadedVT = MemSD->getMemoryVT(); - if (!LoadedVT.isSimple()) return false; @@ -1298,6 +1272,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { getOperationOrderings(MemSD, Subtarget); // If a fence is required before the operation, insert it: + SDLoc DL(N); + SDValue Chain = N->getOperand(0); switch (NVPTX::Ordering(FenceOrdering)) { case NVPTX::Ordering::NotAtomic: break; @@ -1359,6 +1335,16 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { FromTypeWidth = 32; } + SDValue Op1 = N->getOperand(1); + SDValue Addr, Offset, Base; + std::optional Opcode; + SDNode *LD; + + SmallVector Ops({getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(FromType, DL), + getI32Imm(FromTypeWidth, DL)}); + if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: @@ -1378,14 +1364,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(InstructionOrdering, DL), - 
getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Addr, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.push_back(Addr); + Ops.push_back(Chain); } else if (PointerSize == 64 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { @@ -1407,15 +1387,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(InstructionOrdering, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Base, - Offset, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.push_back(Base); + Ops.push_back(Offset); + Ops.push_back(Chain); } else if (PointerSize == 64 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { @@ -1457,16 +1431,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(InstructionOrdering, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Base, - Offset, - Chain}; - - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.push_back(Base); + Ops.push_back(Offset); + Ops.push_back(Chain); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -1507,15 +1474,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - SDValue Ops[] = {getI32Imm(InstructionOrdering, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(FromType, DL), - getI32Imm(FromTypeWidth, DL), - Op1, - Chain}; - LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + Ops.push_back(Op1); + Ops.push_back(Chain); } + LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); MachineMemOperand *MemRef = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(LD), {MemRef}); @@ -1525,8 +1487,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { - - SDValue Chain = N->getOperand(0); SDValue Op1; MemSDNode *Mem; bool IsLDG = true; @@ -1556,12 +1516,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { Mem = cast(N); } - std::optional Opcode; - SDLoc DL(N); - SDNode *LD; - SDValue Base, Offset, Addr; EVT OrigType = N->getValueType(0); - EVT EltVT = Mem->getMemoryVT(); unsigned NumElts = 1; if (EltVT.isVector()) { @@ -1590,6 +1545,12 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { } InstVTs.push_back(MVT::Other); SDVTList InstVTList = CurDAG->getVTList(InstVTs); + SDValue Chain = N->getOperand(0); + + std::optional Opcode; + SDLoc DL(N); + SDNode *LD; + SDValue Base, Offset, Addr; if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { @@ -1940,19 +1901,17 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { } bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { - SDLoc dl(N); MemSDNode *ST = cast(N); assert(ST->writeMem() && "Expected store"); StoreSDNode *PlainStore = dyn_cast(N); AtomicSDNode *AtomicStore = dyn_cast(N); assert((PlainStore || AtomicStore) && "Expected store"); - EVT StoreVT = ST->getMemoryVT(); - SDNode *NVPTXST = nullptr; // do not support pre/post inc/dec if (PlainStore && PlainStore->isIndexed()) return false; + EVT StoreVT = ST->getMemoryVT(); if (!StoreVT.isSimple()) return false; @@ -1966,6 +1925,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { getOperationOrderings(ST, Subtarget); // If a fence is required 
before the operation, insert it: + SDLoc DL(N); SDValue Chain = ST->getChain(); switch (NVPTX::Ordering(FenceOrdering)) { case NVPTX::Ordering::NotAtomic: @@ -1974,7 +1934,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { unsigned Op = Subtarget->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys : NVPTX::INT_MEMBAR_SYS; - Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0); + Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); break; } default: @@ -1984,21 +1944,20 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); - unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + unsigned VecType = NVPTX::PTXLdStInstCode::Scalar; // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' - // MVT ScalarVT = SimpleVT.getScalarType(); - unsigned toTypeWidth = ScalarVT.getSizeInBits(); + unsigned ToTypeWidth = ScalarVT.getSizeInBits(); if (SimpleVT.isVector()) { assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) && "Unexpected vector type"); // v2x16 is stored using st.b32 - toTypeWidth = 32; + ToTypeWidth = 32; } - unsigned int toType = getLdStRegType(ScalarVT); + unsigned int ToType = getLdStRegType(ScalarVT); // Create the machine instruction DAG SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal(); @@ -2009,21 +1968,19 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { MVT::SimpleValueType SourceVT = Value.getNode()->getSimpleValueType(0).SimpleTy; + SmallVector Ops({Value, getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(ToType, DL), + getI32Imm(ToTypeWidth, DL)}); + if (SelectDirectAddr(BasePtr, Addr)) { Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar, NVPTX::ST_i32_avar, NVPTX::ST_i64_avar, NVPTX::ST_f32_avar, NVPTX::ST_f64_avar); if (!Opcode) return false; - SDValue Ops[] = {Value, - getI32Imm(InstructionOrdering, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - Addr, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.push_back(Addr); + Ops.push_back(Chain); } else if (PointerSize == 64 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -2032,16 +1989,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_asi, NVPTX::ST_f64_asi); if (!Opcode) return false; - SDValue Ops[] = {Value, - getI32Imm(InstructionOrdering, dl), - getI32Imm(CodeAddrSpace, dl), - getI32Imm(vecType, dl), - getI32Imm(toType, dl), - getI32Imm(toTypeWidth, dl), - Base, - Offset, - Chain}; - NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); + Ops.push_back(Base); + Ops.push_back(Offset); + Ops.push_back(Chain); } else if (PointerSize == 64 ? 
SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                         : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
@@ -2056,17 +2006,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
     if (!Opcode)
       return false;
-
-    SDValue Ops[] = {Value,
-                     getI32Imm(InstructionOrdering, dl),
-                     getI32Imm(CodeAddrSpace, dl),
-                     getI32Imm(vecType, dl),
-                     getI32Imm(toType, dl),
-                     getI32Imm(toTypeWidth, dl),
-                     Base,
-                     Offset,
-                     Chain};
-    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
+    Ops.push_back(Base);
+    Ops.push_back(Offset);
+    Ops.push_back(Chain);
   } else {
     if (PointerSize == 64)
       Opcode =
@@ -2079,17 +2021,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
                         NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
     if (!Opcode)
       return false;
-    SDValue Ops[] = {Value,
-                     getI32Imm(InstructionOrdering, dl),
-                     getI32Imm(CodeAddrSpace, dl),
-                     getI32Imm(vecType, dl),
-                     getI32Imm(toType, dl),
-                     getI32Imm(toTypeWidth, dl),
-                     BasePtr,
-                     Chain};
-    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
+    Ops.push_back(BasePtr);
+    Ops.push_back(Chain);
   }
 
+  SDNode *NVPTXST =
+      CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
+
   if (!NVPTXST)
     return false;
 
@@ -2146,23 +2084,23 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   unsigned ToTypeWidth = ScalarVT.getSizeInBits();
   unsigned ToType = getLdStRegType(ScalarVT);
 
-  SmallVector StOps;
+  SmallVector Ops;
   SDValue N2;
   unsigned VecType;
 
   switch (N->getOpcode()) {
   case NVPTXISD::StoreV2:
     VecType = NVPTX::PTXLdStInstCode::V2;
-    StOps.push_back(N->getOperand(1));
-    StOps.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(1));
+    Ops.push_back(N->getOperand(2));
     N2 = N->getOperand(3);
     break;
   case NVPTXISD::StoreV4:
     VecType = NVPTX::PTXLdStInstCode::V4;
-    StOps.push_back(N->getOperand(1));
-    StOps.push_back(N->getOperand(2));
-    StOps.push_back(N->getOperand(3));
-    StOps.push_back(N->getOperand(4));
+    Ops.push_back(N->getOperand(1));
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
     N2 = N->getOperand(5);
     break;
   default:
@@ -2179,11 +2117,11 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
     ToTypeWidth = 32;
   }
 
-  StOps.push_back(getI32Imm(InstructionOrdering, DL));
-  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
-  StOps.push_back(getI32Imm(VecType, DL));
-  StOps.push_back(getI32Imm(ToType, DL));
-  StOps.push_back(getI32Imm(ToTypeWidth, DL));
+  Ops.push_back(getI32Imm(InstructionOrdering, DL));
+  Ops.push_back(getI32Imm(CodeAddrSpace, DL));
+  Ops.push_back(getI32Imm(VecType, DL));
+  Ops.push_back(getI32Imm(ToType, DL));
+  Ops.push_back(getI32Imm(ToTypeWidth, DL));
 
   if (SelectDirectAddr(N2, Addr)) {
     switch (N->getOpcode()) {
@@ -2202,7 +2140,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
                                NVPTX::STV_f32_v4_avar, std::nullopt);
       break;
     }
-    StOps.push_back(Addr);
+    Ops.push_back(Addr);
   } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                                : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
     switch (N->getOpcode()) {
@@ -2221,8 +2159,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
                        std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
       break;
     }
-    StOps.push_back(Base);
-    StOps.push_back(Offset);
+    Ops.push_back(Base);
+    Ops.push_back(Offset);
   } else if (PointerSize == 64 ?
SelectADDRri64(N2.getNode(), N2, Base, Offset) : SelectADDRri(N2.getNode(), N2, Base, Offset)) { if (PointerSize == 64) { @@ -2261,8 +2199,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } } - StOps.push_back(Base); - StOps.push_back(Offset); + Ops.push_back(Base); + Ops.push_back(Offset); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -2301,15 +2239,15 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } } - StOps.push_back(N2); + Ops.push_back(N2); } if (!Opcode) return false; - StOps.push_back(Chain); + Ops.push_back(Chain); - ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps); + ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); MachineMemOperand *MemRef = cast(N)->getMemOperand(); CurDAG->setNodeMemRefs(cast(ST), {MemRef}); @@ -2383,10 +2321,8 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { unsigned OffsetVal = Offset->getAsZExtVal(); - SmallVector Ops; - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); - Ops.push_back(Glue); + SmallVector Ops( + {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); return true; From 7ec8da60854430bf963015667fb40190f022bb65 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Wed, 31 Jul 2024 14:24:46 -0700 Subject: [PATCH 15/22] [NVPTX] Simplify NVPTX::Ordering by not making it an enum class --- .../lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 3 +-- llvm/lib/Target/NVPTX/NVPTX.h | 5 +++-- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 10 ++++------ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 5be56e7e6a04c..581e2bc03ff89 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -229,8 +229,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int) MO.getImm(); if (!strcmp(Modifier, "sem")) { - auto Ordering = - NVPTX::Ordering(static_cast(Imm)); + auto Ordering = NVPTX::Ordering(Imm); switch (Ordering) { case NVPTX::Ordering::NotAtomic: break; diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 461e18c790703..27a8e69b8e718 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -106,9 +106,10 @@ enum LoadStore { isStoreShift = 6 }; -// Extends LLVM AtomicOrdering with PTX Orderings: +// Extends LLVM AtomicOrdering with PTX Orderings. +// Values match LLVM AtomicOrdering for common orderings. 
using OrderingUnderlyingType = unsigned int; -enum class Ordering : OrderingUnderlyingType { +enum Ordering : OrderingUnderlyingType { NotAtomic = 0, // PTX calls these: "Weak" // Unordered = 1, // NVPTX maps LLVM Unorderd to Relaxed Relaxed = 2, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 8bff731402563..a3ab30695589b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -718,12 +718,10 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { namespace { struct OperationOrderings { - NVPTX::OrderingUnderlyingType InstrOrdering; - NVPTX::OrderingUnderlyingType FenceOrdering; - OperationOrderings(NVPTX::Ordering O = NVPTX::Ordering::NotAtomic, - NVPTX::Ordering F = NVPTX::Ordering::NotAtomic) - : InstrOrdering(static_cast(O)), - FenceOrdering(static_cast(F)) {} + NVPTX::Ordering InstructionOrdering, FenceOrdering; + OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic, + NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic) + : InstructionOrdering(IO), FenceOrdering(FO) {} }; static OperationOrderings From 8b3e4505113788951e1a5e54b6ba2bfb5d26fc0c Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Wed, 31 Jul 2024 14:39:33 -0700 Subject: [PATCH 16/22] [NVPTX] Move Ordering to string functions to utilities --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 1 + llvm/lib/Target/NVPTX/NVPTX.h | 26 ---------------- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 31 +++++++++++++++++++ 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 581e2bc03ff89..52ddff875ab23 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -13,6 +13,7 @@ #include "MCTargetDesc/NVPTXInstPrinter.h" #include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" +#include "NVPTXUtilities.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 27a8e69b8e718..ddb208597f61a 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -123,32 +123,6 @@ enum Ordering : OrderingUnderlyingType { LAST = RelaxedMMIO }; -inline char const *toCString(Ordering Order) { - switch (Order) { - case Ordering::NotAtomic: - return "NotAtomic"; - case Ordering::Relaxed: - return "Relaxed"; - case Ordering::Acquire: - return "Acquire"; - case Ordering::Release: - return "Release"; - // case Ordering::AcquireRelease: return "AcquireRelease"; - case Ordering::SequentiallyConsistent: - return "SequentiallyConsistent"; - case Ordering::Volatile: - return "Volatile"; - case Ordering::RelaxedMMIO: - return "RelaxedMMIO"; - } - report_fatal_error("unknown ordering"); -} - -inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) { - O << toCString(Order); - return O; -} - namespace PTXLdStInstCode { enum AddressSpace { GENERIC = 0, diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index c15ff6cae1f27..72b5497b33b44 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H +#include "NVPTX.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Function.h" #include 
"llvm/IR/GlobalVariable.h" @@ -82,6 +83,36 @@ inline unsigned promoteScalarArgumentSize(unsigned size) { bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM); bool Isv2x16VT(EVT VT); + +namespace NVPTX { + +inline char const *toCString(Ordering Order) { + switch (Order) { + case Ordering::NotAtomic: + return "NotAtomic"; + case Ordering::Relaxed: + return "Relaxed"; + case Ordering::Acquire: + return "Acquire"; + case Ordering::Release: + return "Release"; + // case Ordering::AcquireRelease: return "AcquireRelease"; + case Ordering::SequentiallyConsistent: + return "SequentiallyConsistent"; + case Ordering::Volatile: + return "Volatile"; + case Ordering::RelaxedMMIO: + return "RelaxedMMIO"; + } + report_fatal_error("unknown ordering"); +} + +inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) { + O << toCString(Order); + return O; +} + +} // namespace NVPTX } #endif From 2d8ab2087d9509ef569be8fc221170d16c72ffd8 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Wed, 31 Jul 2024 14:49:46 -0700 Subject: [PATCH 17/22] [NVPTX] Improve comments in load-store tests --- llvm/test/CodeGen/NVPTX/load-store.ll | 30 ++++++--------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index 81cfb1b6fd493..aac73f71a6766 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -3,6 +3,12 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70 ; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} +; TODO: add i1, <8 x i8>, and <6 x i8> vector tests. + +; TODO: add test for vectors that exceed 128-bit length +; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors +; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. + ; generic statespace ; CHECK-LABEL: generic_plain @@ -91,9 +97,6 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} store <2 x i64> %n.add, ptr %d - ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors - ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. - ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load <2 x float>, ptr %d %o.add = fadd <2 x float> %o.load, @@ -214,9 +217,6 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} store volatile <2 x i64> %n.add, ptr %d - ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors - ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. - ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load volatile <2 x float>, ptr %d %o.add = fadd <2 x float> %o.load, @@ -508,9 +508,6 @@ define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspac ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} store <2 x i64> %n.add, ptr addrspace(1) %d - ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors - ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
- ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load <2 x float>, ptr addrspace(1) %d %o.add = fadd <2 x float> %o.load, @@ -612,9 +609,6 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs ; CHECK: st.volatile.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} store volatile<2 x i64> %n.add, ptr addrspace(1) %d - ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors - ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. - ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load volatile <2 x float>, ptr addrspace(1) %d %o.add = fadd <2 x float> %o.load, @@ -930,9 +924,6 @@ define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspac ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} store <2 x i64> %n.add, ptr addrspace(3) %d - ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors - ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. - ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load <2 x float>, ptr addrspace(3) %d %o.add = fadd <2 x float> %o.load, @@ -1034,9 +1025,6 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} store volatile <2 x i64> %n.add, ptr addrspace(3) %d - ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors - ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. - ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load volatile <2 x float>, ptr addrspace(3) %d %o.add = fadd <2 x float> %o.load, @@ -1332,9 +1320,6 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} store <2 x i64> %n.add, ptr addrspace(5) %d - ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors - ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. - ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load <2 x float>, ptr addrspace(5) %d %o.add = fadd <2 x float> %o.load, @@ -1439,9 +1424,6 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} store volatile <2 x i64> %n.add, ptr addrspace(5) %d - ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors - ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 
- ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] %o.load = load volatile <2 x float>, ptr addrspace(5) %d %o.add = fadd <2 x float> %o.load, From 3df9d668a638d491606a448191d977c7b5213a20 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Wed, 31 Jul 2024 16:15:34 -0700 Subject: [PATCH 18/22] [NVPTX] Refactor fence insertion for loads/stores --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 114 ++++++-------------- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 3 + 2 files changed, 37 insertions(+), 80 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index a3ab30695589b..e41ccea0f748d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -961,6 +961,34 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, }); } +NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, + SDValue &Chain, + MemSDNode *N) { + // Some memory instructions - loads, stores, atomics - need an extra fence + // instruction. Get the memory order of the instruction, and that of its + // fence, if any. + auto [InstructionOrdering, FenceOrdering] = + getOperationOrderings(N, Subtarget); + + // If a fence is required before the operation, insert it: + switch (NVPTX::Ordering(FenceOrdering)) { + case NVPTX::Ordering::NotAtomic: + break; + case NVPTX::Ordering::SequentiallyConsistent: { + unsigned Op = Subtarget->hasMemoryOrdering() + ? NVPTX::atomic_thread_fence_seq_cst_sys + : NVPTX::INT_MEMBAR_SYS; + Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); + break; + } + default: + report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", + toCString(NVPTX::Ordering(FenceOrdering)))); + } + + return InstructionOrdering; +} + bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { unsigned IID = N->getConstantOperandVal(0); switch (IID) { @@ -1124,31 +1152,12 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } - - // Memory Semantic Setting - auto [InstructionOrdering, FenceOrdering] = - getOperationOrderings(LD, Subtarget); - unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); - // If a fence is required before the operation, insert it: SDLoc DL(N); SDValue Chain = N->getOperand(0); - switch (NVPTX::Ordering(FenceOrdering)) { - case NVPTX::Ordering::NotAtomic: - break; - case NVPTX::Ordering::SequentiallyConsistent: { - unsigned Op = Subtarget->hasMemoryOrdering() - ? 
NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::INT_MEMBAR_SYS; - Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); - break; - } - default: - report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", - toCString(NVPTX::Ordering(FenceOrdering)))); - } + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD); // Type Setting: fromType + fromTypeWidth // @@ -1261,31 +1270,12 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { return tryLDGLDU(N); } - unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Memory Semantic Setting - auto [InstructionOrdering, FenceOrdering] = - getOperationOrderings(MemSD, Subtarget); - - // If a fence is required before the operation, insert it: SDLoc DL(N); SDValue Chain = N->getOperand(0); - switch (NVPTX::Ordering(FenceOrdering)) { - case NVPTX::Ordering::NotAtomic: - break; - case NVPTX::Ordering::SequentiallyConsistent: { - unsigned Op = Subtarget->hasMemoryOrdering() - ? NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::INT_MEMBAR_SYS; - Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); - break; - } - default: - report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", - toCString(NVPTX::Ordering(FenceOrdering)))); - } + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD); // Vector Setting MVT SimpleVT = LoadedVT.getSimpleVT(); @@ -1918,27 +1908,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); - // Memory Semantic Setting - auto [InstructionOrdering, FenceOrdering] = - getOperationOrderings(ST, Subtarget); - - // If a fence is required before the operation, insert it: SDLoc DL(N); SDValue Chain = ST->getChain(); - switch (NVPTX::Ordering(FenceOrdering)) { - case NVPTX::Ordering::NotAtomic: - break; - case NVPTX::Ordering::SequentiallyConsistent: { - unsigned Op = Subtarget->hasMemoryOrdering() - ? NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::INT_MEMBAR_SYS; - Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); - break; - } - default: - report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", - toCString(NVPTX::Ordering(FenceOrdering)))); - } + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST); // Vector Setting MVT SimpleVT = StoreVT.getSimpleVT(); @@ -2036,11 +2008,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { } bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { - SDValue Chain = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Addr, Offset, Base; std::optional Opcode; - SDLoc DL(N); SDNode *ST; EVT EltVT = Op1.getValueType(); MemSDNode *MemSD = cast(N); @@ -2055,25 +2025,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned int PointerSize = CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); - // Memory Semantic Setting - auto [InstructionOrdering, FenceOrdering] = - getOperationOrderings(MemSD, Subtarget); - - // If a fence is required before the operation, insert it: - switch (NVPTX::Ordering(FenceOrdering)) { - case NVPTX::Ordering::NotAtomic: - break; - case NVPTX::Ordering::SequentiallyConsistent: { - unsigned Op = Subtarget->hasMemoryOrdering() - ? 
NVPTX::atomic_thread_fence_seq_cst_sys - : NVPTX::INT_MEMBAR_SYS; - Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0); - break; - } - default: - report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", - toCString(NVPTX::Ordering(FenceOrdering)))); - } + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 49626d4051485..eac4056599511 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -99,6 +99,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N); + + NVPTX::Ordering insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, + MemSDNode *N); }; class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy { From 20be14b6203f80bf023632f528ef5f3932627f7c Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Wed, 31 Jul 2024 17:09:45 -0700 Subject: [PATCH 19/22] [NVPTX] Refactor memory ordering --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 30 +++++++++------------ 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index e41ccea0f748d..086375c03733c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -845,26 +845,26 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC || CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL || CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED); + if (!AddrGenericOrGlobalOrShared) + return NVPTX::Ordering::NotAtomic; + bool UseRelaxedMMIO = HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL; switch (Ordering) { case AtomicOrdering::NotAtomic: - return N->isVolatile() && AddrGenericOrGlobalOrShared - ? NVPTX::Ordering::Volatile - : NVPTX::Ordering::NotAtomic; + return N->isVolatile() ? NVPTX::Ordering::Volatile + : NVPTX::Ordering::NotAtomic; case AtomicOrdering::Unordered: // We lower unordered in the exact same way as 'monotonic' to respect // LLVM IR atomicity requirements. case AtomicOrdering::Monotonic: if (N->isVolatile()) - return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO - : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile - : NVPTX::Ordering::NotAtomic; + return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO + : NVPTX::Ordering::Volatile; else - return HasMemoryOrdering ? NVPTX::Ordering::Relaxed - : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile - : NVPTX::Ordering::NotAtomic; + return HasMemoryOrdering ? NVPTX::Ordering::Relaxed + : NVPTX::Ordering::Volatile; // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to // Acquire. case AtomicOrdering::Acquire: @@ -872,15 +872,13 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { report_fatal_error( formatv("PTX only supports Acquire Ordering on reads: {}", N->getOperationName())); - return AddrGenericOrGlobalOrShared ? 
NVPTX::Ordering::Acquire - : NVPTX::Ordering::NotAtomic; + return NVPTX::Ordering::Acquire; case AtomicOrdering::Release: if (!N->writeMem()) report_fatal_error( formatv("PTX only supports Release Ordering on writes: {}", N->getOperationName())); - return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Release - : NVPTX::Ordering::NotAtomic; + return NVPTX::Ordering::Release; case AtomicOrdering::AcquireRelease: { report_fatal_error( formatv("NVPTX does not support AcquireRelease Ordering on " @@ -906,10 +904,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { formatv("NVPTX does not support SequentiallyConsistent Ordering on " "read-modify-writes yet: {}", N->getOperationName())); - return AddrGenericOrGlobalOrShared - ? OperationOrderings(InstrOrder, - NVPTX::Ordering::SequentiallyConsistent) - : OperationOrderings(NVPTX::Ordering::NotAtomic); + return OperationOrderings(InstrOrder, + NVPTX::Ordering::SequentiallyConsistent); } } report_fatal_error( From e865fc3f5c0c51d60dfd1f74bf47ba72374e7c0f Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 1 Aug 2024 13:17:00 -0700 Subject: [PATCH 20/22] [NVPTX] Refactor and cleanups --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTX.h | 10 ++- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 80 +++++++------------ llvm/lib/Target/NVPTX/NVPTXUtilities.h | 4 +- 4 files changed, 39 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 52ddff875ab23..5b568b0487b45 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -252,7 +252,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, default: report_fatal_error(formatv( "NVPTX LdStCode Printer does not support \"{}\" sem modifier.", - toCString(Ordering))); + OrderingToCString(Ordering))); } } else if (!strcmp(Modifier, "addsp")) { switch (Imm) { diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index ddb208597f61a..1b9bf1e0616ca 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -16,6 +16,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CodeGen.h" namespace llvm { @@ -106,8 +107,7 @@ enum LoadStore { isStoreShift = 6 }; -// Extends LLVM AtomicOrdering with PTX Orderings. -// Values match LLVM AtomicOrdering for common orderings. 
+// Extends LLVM AtomicOrdering with PTX Orderings: using OrderingUnderlyingType = unsigned int; enum Ordering : OrderingUnderlyingType { NotAtomic = 0, // PTX calls these: "Weak" @@ -122,6 +122,12 @@ enum Ordering : OrderingUnderlyingType { RelaxedMMIO = 9, LAST = RelaxedMMIO }; +// Values match LLVM AtomicOrdering for common orderings: +static_assert(Ordering::NotAtomic == (unsigned)AtomicOrdering::NotAtomic); +static_assert(Ordering::Relaxed == (unsigned)AtomicOrdering::Monotonic); +static_assert(Ordering::Acquire == (unsigned)AtomicOrdering::Acquire); +static_assert(Ordering::Release == (unsigned)AtomicOrdering::Release); +static_assert(Ordering::SequentiallyConsistent == (unsigned)AtomicOrdering::SequentiallyConsistent); namespace PTXLdStInstCode { enum AddressSpace { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 086375c03733c..d2dc37fc2a5e6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -979,7 +979,7 @@ NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, } default: report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", - toCString(NVPTX::Ordering(FenceOrdering)))); + OrderingToCString(NVPTX::Ordering(FenceOrdering)))); } return InstructionOrdering; @@ -1199,8 +1199,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); if (!Opcode) return false; - Ops.push_back(Addr); - Ops.push_back(Chain); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, @@ -1208,9 +1207,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); if (!Opcode) return false; - Ops.push_back(Base); - Ops.push_back(Offset); - Ops.push_back(Chain); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset) : SelectADDRri(N1.getNode(), N1, Base, Offset)) { if (PointerSize == 64) @@ -1224,9 +1221,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); if (!Opcode) return false; - Ops.push_back(Base); - Ops.push_back(Offset); - Ops.push_back(Chain); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) Opcode = @@ -1239,8 +1234,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); if (!Opcode) return false; - Ops.push_back(N1); - Ops.push_back(Chain); + Ops.append({N1, Chain}); } SDNode *NVPTXLD = @@ -1348,8 +1342,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - Ops.push_back(Addr); - Ops.push_back(Chain); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { @@ -1371,9 +1364,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - Ops.push_back(Base); - Ops.push_back(Offset); - Ops.push_back(Chain); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? 
SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { @@ -1415,9 +1406,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - Ops.push_back(Base); - Ops.push_back(Offset); - Ops.push_back(Chain); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -1458,8 +1447,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (!Opcode) return false; - Ops.push_back(Op1); - Ops.push_back(Chain); + Ops.append({Op1, Chain}); } LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); @@ -1945,8 +1933,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_avar, NVPTX::ST_f64_avar); if (!Opcode) return false; - Ops.push_back(Addr); - Ops.push_back(Chain); + Ops.append({Addr, Chain}); } else if (PointerSize == 64 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -1955,9 +1942,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_asi, NVPTX::ST_f64_asi); if (!Opcode) return false; - Ops.push_back(Base); - Ops.push_back(Offset); - Ops.push_back(Chain); + Ops.append({Base, Offset, Chain}); } else if (PointerSize == 64 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset) : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) { @@ -1972,9 +1957,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_ari, NVPTX::ST_f64_ari); if (!Opcode) return false; - Ops.push_back(Base); - Ops.push_back(Offset); - Ops.push_back(Chain); + Ops.append({Base, Offset, Chain}); } else { if (PointerSize == 64) Opcode = @@ -1987,8 +1970,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { NVPTX::ST_f32_areg, NVPTX::ST_f64_areg); if (!Opcode) return false; - Ops.push_back(BasePtr); - Ops.push_back(Chain); + Ops.append({BasePtr, Chain}); } SDNode *NVPTXST = NVPTXST = @@ -2039,16 +2021,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { switch (N->getOpcode()) { case NVPTXISD::StoreV2: VecType = NVPTX::PTXLdStInstCode::V2; - Ops.push_back(N->getOperand(1)); - Ops.push_back(N->getOperand(2)); + Ops.append({N->getOperand(1), N->getOperand(2)}); N2 = N->getOperand(3); break; case NVPTXISD::StoreV4: VecType = NVPTX::PTXLdStInstCode::V4; - Ops.push_back(N->getOperand(1)); - Ops.push_back(N->getOperand(2)); - Ops.push_back(N->getOperand(3)); - Ops.push_back(N->getOperand(4)); + Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3), N->getOperand(4)}); N2 = N->getOperand(5); break; default: @@ -2065,11 +2043,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - Ops.push_back(getI32Imm(InstructionOrdering, DL)); - Ops.push_back(getI32Imm(CodeAddrSpace, DL)); - Ops.push_back(getI32Imm(VecType, DL)); - Ops.push_back(getI32Imm(ToType, DL)); - Ops.push_back(getI32Imm(ToTypeWidth, DL)); + Ops.append({ + getI32Imm(InstructionOrdering, DL), + getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), + getI32Imm(ToType, DL), + getI32Imm(ToTypeWidth, DL)}); if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { @@ -2107,8 +2086,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt); break; } - Ops.push_back(Base); - Ops.push_back(Offset); + Ops.append({Base, Offset}); } else if (PointerSize == 64 ? 
SelectADDRri64(N2.getNode(), N2, Base, Offset) : SelectADDRri(N2.getNode(), N2, Base, Offset)) { if (PointerSize == 64) { @@ -2147,8 +2125,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; } } - Ops.push_back(Base); - Ops.push_back(Offset); + Ops.append({Base, Offset}); } else { if (PointerSize == 64) { switch (N->getOpcode()) { @@ -2303,8 +2280,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 2)); - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); + Ops.append({CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain}); // Determine target opcode // If we have an i1, use an 8-bit store. The lowering code in @@ -2484,10 +2460,10 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 3)); - Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32)); - Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); - Ops.push_back(Chain); - Ops.push_back(Glue); + Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), + CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), + Chain, + Glue}); // Determine target opcode // If we have an i1, use an 8-bit store. The lowering code in diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 72b5497b33b44..23ab212790546 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -86,7 +86,7 @@ bool Isv2x16VT(EVT VT); namespace NVPTX { -inline char const *toCString(Ordering Order) { +inline std::string_view OrderingToCString(Ordering Order) { switch (Order) { case Ordering::NotAtomic: return "NotAtomic"; @@ -108,7 +108,7 @@ inline char const *toCString(Ordering Order) { } inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) { - O << toCString(Order); + O << OrderingToCString(Order); return O; } From 873838570291bd7b253ea667a04f835dec71f0b4 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Thu, 1 Aug 2024 13:50:37 -0700 Subject: [PATCH 21/22] [NVPTX] Switch to std::string --- llvm/lib/Target/NVPTX/NVPTX.h | 3 ++- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 21 +++++++++------------ llvm/lib/Target/NVPTX/NVPTXUtilities.h | 2 +- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 1b9bf1e0616ca..939cb2afe0a57 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -127,7 +127,8 @@ static_assert(Ordering::NotAtomic == (unsigned)AtomicOrdering::NotAtomic); static_assert(Ordering::Relaxed == (unsigned)AtomicOrdering::Monotonic); static_assert(Ordering::Acquire == (unsigned)AtomicOrdering::Acquire); static_assert(Ordering::Release == (unsigned)AtomicOrdering::Release); -static_assert(Ordering::SequentiallyConsistent == (unsigned)AtomicOrdering::SequentiallyConsistent); +static_assert(Ordering::SequentiallyConsistent == + (unsigned)AtomicOrdering::SequentiallyConsistent); namespace PTXLdStInstCode { enum AddressSpace { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index d2dc37fc2a5e6..25c198f0121e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -978,8 +978,9 @@ NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, break; } default: - 
report_fatal_error(formatv("Unexpected fence ordering: \"{}\".", - OrderingToCString(NVPTX::Ordering(FenceOrdering)))); + report_fatal_error( + formatv("Unexpected fence ordering: \"{}\".", + OrderingToCString(NVPTX::Ordering(FenceOrdering)))); } return InstructionOrdering; @@ -2026,7 +2027,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { break; case NVPTXISD::StoreV4: VecType = NVPTX::PTXLdStInstCode::V4; - Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3), N->getOperand(4)}); + Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(4)}); N2 = N->getOperand(5); break; default: @@ -2043,12 +2045,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { ToTypeWidth = 32; } - Ops.append({ - getI32Imm(InstructionOrdering, DL), - getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(ToType, DL), - getI32Imm(ToTypeWidth, DL)}); + Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL), + getI32Imm(VecType, DL), getI32Imm(ToType, DL), + getI32Imm(ToTypeWidth, DL)}); if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { @@ -2461,9 +2460,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(N->getOperand(i + 3)); Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), - CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), - Chain, - Glue}); + CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); // Determine target opcode // If we have an i1, use an 8-bit store. The lowering code in diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 23ab212790546..b3b0147b76929 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -86,7 +86,7 @@ bool Isv2x16VT(EVT VT); namespace NVPTX { -inline std::string_view OrderingToCString(Ordering Order) { +inline std::string OrderingToCString(Ordering Order) { switch (Order) { case Ordering::NotAtomic: return "NotAtomic"; From ba80e0f4296c0e347aa97b733bb68372026ab6d8 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 6 Aug 2024 05:02:50 -0700 Subject: [PATCH 22/22] [NVPTX] Ordering uses AtomicOrdering values directly --- llvm/lib/Target/NVPTX/NVPTX.h | 23 +++++++++-------------- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 939cb2afe0a57..f6f6acb9e13c9 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -110,25 +110,20 @@ enum LoadStore { // Extends LLVM AtomicOrdering with PTX Orderings: using OrderingUnderlyingType = unsigned int; enum Ordering : OrderingUnderlyingType { - NotAtomic = 0, // PTX calls these: "Weak" + NotAtomic = (OrderingUnderlyingType) + AtomicOrdering::NotAtomic, // PTX calls these: "Weak" // Unordered = 1, // NVPTX maps LLVM Unorderd to Relaxed - Relaxed = 2, + Relaxed = (OrderingUnderlyingType)AtomicOrdering::Monotonic, // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire" - Acquire = 4, - Release = 5, + Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire, + Release = (OrderingUnderlyingType)AtomicOrdering::Release, // AcquireRelease = 6, // TODO - SequentiallyConsistent = 7, - Volatile = 8, - RelaxedMMIO = 9, + SequentiallyConsistent = + (OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent, + Volatile = SequentiallyConsistent + 1, + RelaxedMMIO = Volatile + 1, LAST = RelaxedMMIO }; -// Values 
match LLVM AtomicOrdering for common orderings: -static_assert(Ordering::NotAtomic == (unsigned)AtomicOrdering::NotAtomic); -static_assert(Ordering::Relaxed == (unsigned)AtomicOrdering::Monotonic); -static_assert(Ordering::Acquire == (unsigned)AtomicOrdering::Acquire); -static_assert(Ordering::Release == (unsigned)AtomicOrdering::Release); -static_assert(Ordering::SequentiallyConsistent == - (unsigned)AtomicOrdering::SequentiallyConsistent); namespace PTXLdStInstCode { enum AddressSpace { diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index b3b0147b76929..eebd91fefe4f0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -113,6 +113,6 @@ inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) { } } // namespace NVPTX -} +} // namespace llvm #endif
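
Patches 18 and 19 above consolidate the same idea in one place: a sequentially consistent load or store is selected as a leading seq_cst fence (fence.sc.sys on sm_70+/PTX 6.0+, membar.sys otherwise) followed by the access itself with a weaker per-instruction ordering. The stand-alone sketch below models that split only; all names are hypothetical and it has no LLVM dependencies, so it is an illustration of the scheme rather than the backend code.

// Illustrative sketch: how a requested seq_cst ordering is split into a fence
// ordering plus a weaker instruction-level ordering, mirroring what the
// refactored getOperationOrderings()/insertMemoryInstructionFence() pair does.
#include <cassert>
#include <cstdio>
#include <utility>

enum class Ordering { NotAtomic, Relaxed, Acquire, Release, SequentiallyConsistent };

// Returns {instruction ordering, fence ordering} for a memory access.
static std::pair<Ordering, Ordering> splitSeqCst(bool IsLoad, Ordering Requested) {
  if (Requested != Ordering::SequentiallyConsistent)
    return {Requested, Ordering::NotAtomic}; // no extra fence required
  // seq_cst access: emit a seq_cst fence first, then an acquire load or a
  // release store.
  Ordering Instr = IsLoad ? Ordering::Acquire : Ordering::Release;
  return {Instr, Ordering::SequentiallyConsistent};
}

int main() {
  auto [Instr, Fence] = splitSeqCst(/*IsLoad=*/true, Ordering::SequentiallyConsistent);
  assert(Instr == Ordering::Acquire && Fence == Ordering::SequentiallyConsistent);
  std::puts("seq_cst load -> seq_cst fence, then acquire load");
  return 0;
}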
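
The final patch drops the hard-coded enumerator values and their static_asserts by casting the shared values directly from llvm::AtomicOrdering. A minimal stand-alone sketch of that pattern follows; the AtomicOrdering stand-in and the fromAtomicOrdering() helper are assumptions for illustration (the real code includes llvm/Support/AtomicOrdering.h and performs the conversion inline).

// Sketch: reuse another enum's numeric values so shared orderings convert with
// a plain cast, and target-only orderings continue past the last shared value.
#include <cstdint>

// Stand-in for llvm::AtomicOrdering (values consistent with the patterns used
// earlier in this series: acquire = 4, release = 5, seq_cst = 7; 3 is the
// unimplemented Consume ordering).
enum class AtomicOrdering : uint8_t {
  NotAtomic = 0,
  Unordered = 1,
  Monotonic = 2,
  Acquire = 4,
  Release = 5,
  AcquireRelease = 6,
  SequentiallyConsistent = 7,
};

using OrderingUnderlyingType = unsigned int;
enum Ordering : OrderingUnderlyingType {
  NotAtomic = (OrderingUnderlyingType)AtomicOrdering::NotAtomic,
  Relaxed = (OrderingUnderlyingType)AtomicOrdering::Monotonic,
  Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire,
  Release = (OrderingUnderlyingType)AtomicOrdering::Release,
  SequentiallyConsistent =
      (OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent,
  // Target-only orderings are appended after the shared values:
  Volatile = SequentiallyConsistent + 1,
  RelaxedMMIO = Volatile + 1,
  LAST = RelaxedMMIO,
};

// Hypothetical helper: with shared values, conversion is just a cast.
constexpr Ordering fromAtomicOrdering(AtomicOrdering AO) {
  return Ordering((OrderingUnderlyingType)AO);
}

static_assert(fromAtomicOrdering(AtomicOrdering::Acquire) == Ordering::Acquire,
              "shared values line up");

int main() {
  return fromAtomicOrdering(AtomicOrdering::Release) == Ordering::Release ? 0 : 1;
}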