diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 9f16cf5d5bc38..47127a6c29603 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1347,13 +1347,19 @@ ScheduleDAGMILive *createGenericSchedLive(MachineSchedContext *C);
 /// Create a generic scheduler with no vreg liveness or DAG mutation passes.
 ScheduleDAGMI *createGenericSchedPostRA(MachineSchedContext *C);
 
+/// If ReorderWhileClustering is set to true, no attempt will be made to
+/// reduce reordering due to load clustering.
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI);
+                             const TargetRegisterInfo *TRI,
+                             bool ReorderWhileClustering = false);
 
+/// If ReorderWhileClustering is set to true, no attempt will be made to
+/// reduce reordering due to store clustering.
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI);
+                              const TargetRegisterInfo *TRI,
+                              bool ReorderWhileClustering = false);
 
 std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 886137d86f87d..554776783eff6 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1743,11 +1743,14 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
   bool IsLoad;
+  bool ReorderWhileClustering;
 
 public:
   BaseMemOpClusterMutation(const TargetInstrInfo *tii,
-                           const TargetRegisterInfo *tri, bool IsLoad)
-      : TII(tii), TRI(tri), IsLoad(IsLoad) {}
+                           const TargetRegisterInfo *tri, bool IsLoad,
+                           bool ReorderWhileClustering)
+      : TII(tii), TRI(tri), IsLoad(IsLoad),
+        ReorderWhileClustering(ReorderWhileClustering) {}
 
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
@@ -1763,14 +1766,16 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
 class StoreClusterMutation : public BaseMemOpClusterMutation {
 public:
   StoreClusterMutation(const TargetInstrInfo *tii,
-                       const TargetRegisterInfo *tri)
-      : BaseMemOpClusterMutation(tii, tri, false) {}
+                       const TargetRegisterInfo *tri,
+                       bool ReorderWhileClustering)
+      : BaseMemOpClusterMutation(tii, tri, false, ReorderWhileClustering) {}
 };
 
 class LoadClusterMutation : public BaseMemOpClusterMutation {
 public:
-  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
-      : BaseMemOpClusterMutation(tii, tri, true) {}
+  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri,
+                      bool ReorderWhileClustering)
+      : BaseMemOpClusterMutation(tii, tri, true, ReorderWhileClustering) {}
 };
 
 } // end anonymous namespace
@@ -1779,15 +1784,19 @@ namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
+                             const TargetRegisterInfo *TRI,
+                             bool ReorderWhileClustering) {
+  return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
+                                  TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
+                              const TargetRegisterInfo *TRI,
+                              bool ReorderWhileClustering) {
+  return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
+                                  TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
@@ -1840,7 +1849,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
   SUnit *SUa = MemOpa.SU;
   SUnit *SUb = MemOpb.SU;
 
-  if (SUa->NodeNum > SUb->NodeNum)
+  if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
     std::swap(SUa, SUb);
 
   // FIXME: Is this check really required?
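The mutation itself changes in only two places: the constructor plumbing above and the guarded swap in clusterNeighboringMemOps. With ReorderWhileClustering left at its default of false, the pair is always emitted with the lower NodeNum first, so clustering never reorders the original instruction stream; passing true lets the cluster follow ascending offset order even when that reorders the input. A minimal sketch of how a backend opts in when building its pre-RA scheduler follows; MyTargetPassConfig is a hypothetical stand-in, and the RISC-V hunk later in this patch does the same thing for real.

// Editorial sketch, not part of the patch: a hypothetical target opting in to
// reordering while clustering. Mirrors the RISCVTargetMachine.cpp change below.
ScheduleDAGInstrs *
MyTargetPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  // Allow the mutation to emit clustered loads in ascending offset order even
  // if that reorders them relative to the input schedule.
  DAG->addMutation(createLoadClusterDAGMutation(
      DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
  return DAG;
}
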
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1dcff7eb563e2..0aab5e9376c1d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2194,6 +2194,71 @@ MachineInstr *RISCVInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
       .setMIFlags(MemI.getFlags());
 }
 
+bool RISCVInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                             int64_t &Offset1,
+                                             int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+
+  auto IsLoadOpcode = [&](unsigned Opcode) {
+    switch (Opcode) {
+    case RISCV::LB:
+    case RISCV::LBU:
+    case RISCV::LH:
+    case RISCV::LHU:
+    case RISCV::FLH:
+    case RISCV::LW:
+    case RISCV::LWU:
+    case RISCV::FLW:
+    case RISCV::LD:
+    case RISCV::FLD:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
+      !IsLoadOpcode(Load2->getMachineOpcode()))
+    return false;
+
+  // Check if base address and chain operands match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(2) != Load2->getOperand(2))
+    return false;
+
+  // Determine the offsets.
+  if (isa<ConstantSDNode>(Load1->getOperand(1)) &&
+      isa<ConstantSDNode>(Load2->getOperand(1))) {
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue();
+    Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+bool RISCVInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                             int64_t Offset1, int64_t Offset2,
+                                             unsigned NumLoads) const {
+  assert(Offset2 > Offset1);
+
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  // If the machine opcodes differ, conservatively assume the loads do not
+  // share a base address.
+  if (Load1->getMachineOpcode() != Load2->getMachineOpcode())
+    return false; // FIXME: overly conservative?
+
+  // Four loads in a row should be sufficient.
+  if (NumLoads >= 3)
+    return false;
+
+  return true;
+}
+
 bool RISCVInstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
@@ -2282,9 +2347,13 @@ bool RISCVInstrInfo::shouldClusterMemOps(
     return false;
   }
 
-  // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
-  // indicate they likely share a cache line.
-  return ClusterSize <= 4;
+  unsigned CacheLineSize =
+      BaseOps1.front()->getParent()->getMF()->getSubtarget().getCacheLineSize();
+  // Assume a cache line size of 64 bytes if no size is set in RISCVSubtarget.
+  CacheLineSize = CacheLineSize ? CacheLineSize : 64;
+  // Cluster if the memory operations are on the same or a neighbouring cache
+  // line.
+  return std::abs(Offset1 - Offset2) < CacheLineSize;
 }
 
 // Set BaseReg (the base register operand), Offset (the byte offset being
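The new shouldClusterMemOps heuristic replaces the fixed ClusterSize <= 4 cut-off with a distance check against the subtarget's cache line size, falling back to 64 bytes when the subtarget does not report one. Two accesses less than one cache line apart sit on the same or an adjacent line, so scheduling them back to back keeps that line hot. Below is a small standalone sketch of the same predicate; the helper name and the example offsets are invented here purely for illustration and assume the 64-byte fallback.

// Editorial sketch, not part of the patch: the clustering predicate in
// isolation, exercised with a few example offsets.
#include <cassert>
#include <cstdint>
#include <cstdlib>

static bool offsetsWorthClustering(int64_t Offset1, int64_t Offset2,
                                   unsigned CacheLineSize) {
  // Mirror of the patch: fall back to 64 bytes when the subtarget reports 0.
  CacheLineSize = CacheLineSize ? CacheLineSize : 64;
  // Accesses less than one line apart are on the same or a neighbouring line.
  return std::abs(Offset1 - Offset2) < CacheLineSize;
}

int main() {
  assert(offsetsWorthClustering(0, 56, 0));   // same 64-byte line: cluster
  assert(offsetsWorthClustering(40, 72, 0));  // adjacent lines: still cluster
  assert(!offsetsWorthClustering(0, 128, 0)); // two lines apart: do not cluster
  return 0;
}
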
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 7e1d3f3118065..4a30f3a396531 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -157,6 +157,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
       int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
       const TargetRegisterInfo *TRI) const override;
 
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
@@ -164,6 +167,10 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1,
+                               int64_t Offset2,
+                               unsigned NumLoads) const override;
+
   bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
                                     const MachineOperand *&BaseOp,
                                     int64_t &Offset, unsigned &Width,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 3abdb6003659f..7b0fccd997d30 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -95,11 +95,6 @@ static cl::opt
         cl::desc("Enable Split RegisterAlloc for RVV"), cl::init(true));
 
-static cl::opt<bool> EnableMISchedLoadClustering(
-    "riscv-misched-load-clustering", cl::Hidden,
-    cl::desc("Enable load clustering in the machine scheduler"),
-    cl::init(false));
-
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -345,15 +340,10 @@ class RISCVPassConfig : public TargetPassConfig {
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override {
     const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
-    ScheduleDAGMILive *DAG = nullptr;
-    if (EnableMISchedLoadClustering) {
-      DAG = createGenericSchedLive(C);
-      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-    }
-    if (ST.hasMacroFusion()) {
-      DAG = DAG ? DAG : createGenericSchedLive(C);
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI, true));
+    if (ST.hasMacroFusion())
       DAG->addMutation(createRISCVMacroFusionDAGMutation());
-    }
     return DAG;
   }
 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
index 501a3c0ce7438..b77d6bb1fac8f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
@@ -63,15 +63,15 @@ define i32 @va1(ptr %fmt, ...) {
 ; RV64-NEXT:    sd a2, 32(sp)
 ; RV64-NEXT:    sd a3, 40(sp)
 ; RV64-NEXT:    sd a4, 48(sp)
-; RV64-NEXT:    sd a5, 56(sp)
 ; RV64-NEXT:    addi a0, sp, 24
 ; RV64-NEXT:    sd a0, 8(sp)
-; RV64-NEXT:    lw a0, 12(sp)
-; RV64-NEXT:    lwu a1, 8(sp)
+; RV64-NEXT:    lwu a0, 8(sp)
+; RV64-NEXT:    lw a1, 12(sp)
+; RV64-NEXT:    sd a5, 56(sp)
 ; RV64-NEXT:    sd a6, 64(sp)
 ; RV64-NEXT:    sd a7, 72(sp)
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    addi a1, a0, 4
 ; RV64-NEXT:    srli a2, a1, 32
 ; RV64-NEXT:    sw a1, 8(sp)
@@ -968,22 +968,22 @@ define i32 @va_large_stack(ptr %fmt, ...)
{ ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: sd a4, 304(a0) ; RV64-NEXT: lui a0, 24414 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: sd a5, 312(a0) -; RV64-NEXT: lui a0, 24414 ; RV64-NEXT: addiw a0, a0, 280 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: sd a0, 8(sp) -; RV64-NEXT: lw a0, 12(sp) -; RV64-NEXT: lwu a1, 8(sp) +; RV64-NEXT: lwu a0, 8(sp) +; RV64-NEXT: lw a1, 12(sp) +; RV64-NEXT: lui a2, 24414 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: sd a5, 312(a2) ; RV64-NEXT: lui a2, 24414 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: sd a6, 320(a2) ; RV64-NEXT: lui a2, 24414 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: sd a7, 328(a2) -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: addi a1, a0, 4 ; RV64-NEXT: srli a2, a1, 32 ; RV64-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll index 274f1cef49aa9..823918f1c42e7 100644 --- a/llvm/test/CodeGen/RISCV/add-before-shl.ll +++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll @@ -167,17 +167,17 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a2, 0(a1) ; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a4, 12(a1) -; RV32I-NEXT: lw a1, 8(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) ; RV32I-NEXT: srli a5, a2, 29 ; RV32I-NEXT: slli a6, a3, 3 ; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: srli a3, a3, 29 -; RV32I-NEXT: slli a6, a1, 3 +; RV32I-NEXT: slli a6, a4, 3 ; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: srli a1, a1, 29 -; RV32I-NEXT: slli a4, a4, 3 -; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: srli a4, a4, 29 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a2, a2, 3 ; RV32I-NEXT: lui a4, 128 ; RV32I-NEXT: add a1, a1, a4 @@ -200,26 +200,26 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; ; RV32C-LABEL: add_wide_operand: ; RV32C: # %bb.0: -; RV32C-NEXT: lw a6, 4(a1) -; RV32C-NEXT: c.lw a3, 12(a1) -; RV32C-NEXT: c.lw a4, 0(a1) +; RV32C-NEXT: c.lw a2, 12(a1) +; RV32C-NEXT: lw a6, 0(a1) +; RV32C-NEXT: c.lw a3, 4(a1) ; RV32C-NEXT: c.lw a1, 8(a1) ; RV32C-NEXT: c.lui a5, 16 -; RV32C-NEXT: c.add a3, a5 -; RV32C-NEXT: c.slli a3, 3 +; RV32C-NEXT: c.add a2, a5 +; RV32C-NEXT: c.slli a2, 3 ; RV32C-NEXT: srli a5, a1, 29 -; RV32C-NEXT: c.or a3, a5 -; RV32C-NEXT: srli a5, a4, 29 -; RV32C-NEXT: slli a2, a6, 3 ; RV32C-NEXT: c.or a2, a5 ; RV32C-NEXT: srli a5, a6, 29 +; RV32C-NEXT: slli a4, a3, 3 +; RV32C-NEXT: c.or a4, a5 +; RV32C-NEXT: c.srli a3, 29 ; RV32C-NEXT: c.slli a1, 3 -; RV32C-NEXT: c.or a1, a5 -; RV32C-NEXT: c.slli a4, 3 -; RV32C-NEXT: c.sw a4, 0(a0) +; RV32C-NEXT: c.or a1, a3 +; RV32C-NEXT: c.slli a6, 3 +; RV32C-NEXT: sw a6, 0(a0) ; RV32C-NEXT: c.sw a1, 8(a0) -; RV32C-NEXT: c.sw a2, 4(a0) -; RV32C-NEXT: c.sw a3, 12(a0) +; RV32C-NEXT: c.sw a4, 4(a0) +; RV32C-NEXT: c.sw a2, 12(a0) ; RV32C-NEXT: c.jr ra ; ; RV64C-LABEL: add_wide_operand: diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll index 895852b84e004..c9b1a00968ab9 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll @@ -192,37 +192,37 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a4, 4(a0) -; RV32-NEXT: lw a5, 0(a0) +; RV32-NEXT: lw a4, 0(a0) +; RV32-NEXT: lw a5, 4(a0) ; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s2, a1 ; RV32-NEXT: j .LBB11_2 ; 
RV32-NEXT: .LBB11_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV32-NEXT: sw a5, 8(sp) -; RV32-NEXT: sw a4, 12(sp) +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sw a5, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a4, 12(sp) -; RV32-NEXT: lw a5, 8(sp) +; RV32-NEXT: lw a4, 8(sp) +; RV32-NEXT: lw a5, 12(sp) ; RV32-NEXT: bnez a0, .LBB11_6 ; RV32-NEXT: .LBB11_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a4, s1, .LBB11_4 +; RV32-NEXT: beq a5, s1, .LBB11_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV32-NEXT: slt a0, s1, a4 -; RV32-NEXT: mv a2, a5 -; RV32-NEXT: mv a3, a4 +; RV32-NEXT: slt a0, s1, a5 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: bnez a0, .LBB11_1 ; RV32-NEXT: j .LBB11_5 ; RV32-NEXT: .LBB11_4: # in Loop: Header=BB11_2 Depth=1 -; RV32-NEXT: sltu a0, s2, a5 -; RV32-NEXT: mv a2, a5 -; RV32-NEXT: mv a3, a4 +; RV32-NEXT: sltu a0, s2, a4 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: bnez a0, .LBB11_1 ; RV32-NEXT: .LBB11_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB11_2 Depth=1 @@ -268,37 +268,37 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a4, 4(a0) -; RV32-NEXT: lw a5, 0(a0) +; RV32-NEXT: lw a4, 0(a0) +; RV32-NEXT: lw a5, 4(a0) ; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s2, a1 ; RV32-NEXT: j .LBB13_2 ; RV32-NEXT: .LBB13_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV32-NEXT: sw a5, 8(sp) -; RV32-NEXT: sw a4, 12(sp) +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sw a5, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a4, 12(sp) -; RV32-NEXT: lw a5, 8(sp) +; RV32-NEXT: lw a4, 8(sp) +; RV32-NEXT: lw a5, 12(sp) ; RV32-NEXT: bnez a0, .LBB13_6 ; RV32-NEXT: .LBB13_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a4, s1, .LBB13_4 +; RV32-NEXT: beq a5, s1, .LBB13_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV32-NEXT: sltu a0, s1, a4 -; RV32-NEXT: mv a2, a5 -; RV32-NEXT: mv a3, a4 +; RV32-NEXT: sltu a0, s1, a5 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: bnez a0, .LBB13_1 ; RV32-NEXT: j .LBB13_5 ; RV32-NEXT: .LBB13_4: # in Loop: Header=BB13_2 Depth=1 -; RV32-NEXT: sltu a0, s2, a5 -; RV32-NEXT: mv a2, a5 -; RV32-NEXT: mv a3, a4 +; RV32-NEXT: sltu a0, s2, a4 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: bnez a0, .LBB13_1 ; RV32-NEXT: .LBB13_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB13_2 Depth=1 @@ -344,37 +344,37 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a4, 4(a0) -; RV32-NEXT: lw a5, 0(a0) +; RV32-NEXT: lw a4, 0(a0) +; RV32-NEXT: lw a5, 4(a0) ; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s2, a1 ; RV32-NEXT: j .LBB15_2 ; RV32-NEXT: .LBB15_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB15_2 Depth=1 -; RV32-NEXT: sw a5, 8(sp) -; RV32-NEXT: sw a4, 12(sp) +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sw a5, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; 
RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a4, 12(sp) -; RV32-NEXT: lw a5, 8(sp) +; RV32-NEXT: lw a4, 8(sp) +; RV32-NEXT: lw a5, 12(sp) ; RV32-NEXT: bnez a0, .LBB15_6 ; RV32-NEXT: .LBB15_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a4, s1, .LBB15_4 +; RV32-NEXT: beq a5, s1, .LBB15_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB15_2 Depth=1 -; RV32-NEXT: slt a0, s1, a4 -; RV32-NEXT: mv a2, a5 -; RV32-NEXT: mv a3, a4 +; RV32-NEXT: slt a0, s1, a5 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: beqz a0, .LBB15_1 ; RV32-NEXT: j .LBB15_5 ; RV32-NEXT: .LBB15_4: # in Loop: Header=BB15_2 Depth=1 -; RV32-NEXT: sltu a0, s2, a5 -; RV32-NEXT: mv a2, a5 -; RV32-NEXT: mv a3, a4 +; RV32-NEXT: sltu a0, s2, a4 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: beqz a0, .LBB15_1 ; RV32-NEXT: .LBB15_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB15_2 Depth=1 @@ -420,37 +420,37 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a4, 4(a0) -; RV32-NEXT: lw a5, 0(a0) +; RV32-NEXT: lw a4, 0(a0) +; RV32-NEXT: lw a5, 4(a0) ; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s2, a1 ; RV32-NEXT: j .LBB17_2 ; RV32-NEXT: .LBB17_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB17_2 Depth=1 -; RV32-NEXT: sw a5, 8(sp) -; RV32-NEXT: sw a4, 12(sp) +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sw a5, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a4, 12(sp) -; RV32-NEXT: lw a5, 8(sp) +; RV32-NEXT: lw a4, 8(sp) +; RV32-NEXT: lw a5, 12(sp) ; RV32-NEXT: bnez a0, .LBB17_6 ; RV32-NEXT: .LBB17_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a4, s1, .LBB17_4 +; RV32-NEXT: beq a5, s1, .LBB17_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB17_2 Depth=1 -; RV32-NEXT: sltu a0, s1, a4 -; RV32-NEXT: mv a2, a5 -; RV32-NEXT: mv a3, a4 +; RV32-NEXT: sltu a0, s1, a5 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: beqz a0, .LBB17_1 ; RV32-NEXT: j .LBB17_5 ; RV32-NEXT: .LBB17_4: # in Loop: Header=BB17_2 Depth=1 -; RV32-NEXT: sltu a0, s2, a5 -; RV32-NEXT: mv a2, a5 -; RV32-NEXT: mv a3, a4 +; RV32-NEXT: sltu a0, s2, a4 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: mv a3, a5 ; RV32-NEXT: beqz a0, .LBB17_1 ; RV32-NEXT: .LBB17_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB17_2 Depth=1 diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll index e97a1ea5dfca0..512ee2eab2d31 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -19066,36 +19066,36 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB220_2 ; RV32I-NEXT: .LBB220_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, 
s0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB220_7 ; RV32I-NEXT: .LBB220_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB220_4 +; RV32I-NEXT: beq a4, s1, .LBB220_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB220_5 ; RV32I-NEXT: .LBB220_4: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB220_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB220_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 @@ -19103,8 +19103,8 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB220_1 ; RV32I-NEXT: .LBB220_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19120,36 +19120,36 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB220_2 ; RV32IA-NEXT: .LBB220_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB220_7 ; RV32IA-NEXT: .LBB220_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB220_4 +; RV32IA-NEXT: beq a4, s1, .LBB220_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB220_5 ; RV32IA-NEXT: .LBB220_4: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB220_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB220_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 @@ -19157,8 +19157,8 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB220_1 ; RV32IA-NEXT: .LBB220_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: 
lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19219,36 +19219,36 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB221_2 ; RV32I-NEXT: .LBB221_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB221_7 ; RV32I-NEXT: .LBB221_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB221_4 +; RV32I-NEXT: beq a4, s1, .LBB221_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB221_5 ; RV32I-NEXT: .LBB221_4: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB221_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB221_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 @@ -19256,8 +19256,8 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB221_1 ; RV32I-NEXT: .LBB221_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19273,36 +19273,36 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB221_2 ; RV32IA-NEXT: .LBB221_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB221_7 ; RV32IA-NEXT: .LBB221_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB221_4 +; RV32IA-NEXT: beq a4, s1, .LBB221_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB221_5 ; RV32IA-NEXT: .LBB221_4: # in Loop: Header=BB221_2 Depth=1 -; 
RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB221_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB221_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 @@ -19310,8 +19310,8 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB221_1 ; RV32IA-NEXT: .LBB221_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19377,36 +19377,36 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB222_2 ; RV32I-NEXT: .LBB222_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB222_7 ; RV32I-NEXT: .LBB222_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB222_4 +; RV32I-NEXT: beq a4, s1, .LBB222_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB222_5 ; RV32I-NEXT: .LBB222_4: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB222_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB222_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 @@ -19414,8 +19414,8 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB222_1 ; RV32I-NEXT: .LBB222_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19431,36 +19431,36 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB222_2 ; RV32IA-NEXT: .LBB222_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; 
RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB222_7 ; RV32IA-NEXT: .LBB222_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB222_4 +; RV32IA-NEXT: beq a4, s1, .LBB222_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB222_5 ; RV32IA-NEXT: .LBB222_4: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB222_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB222_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 @@ -19468,8 +19468,8 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB222_1 ; RV32IA-NEXT: .LBB222_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19535,36 +19535,36 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB223_2 ; RV32I-NEXT: .LBB223_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB223_7 ; RV32I-NEXT: .LBB223_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB223_4 +; RV32I-NEXT: beq a4, s1, .LBB223_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB223_5 ; RV32I-NEXT: .LBB223_4: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB223_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB223_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 @@ -19572,8 +19572,8 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB223_1 ; RV32I-NEXT: .LBB223_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; 
RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19589,36 +19589,36 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB223_2 ; RV32IA-NEXT: .LBB223_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB223_7 ; RV32IA-NEXT: .LBB223_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB223_4 +; RV32IA-NEXT: beq a4, s1, .LBB223_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB223_5 ; RV32IA-NEXT: .LBB223_4: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB223_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB223_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 @@ -19626,8 +19626,8 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB223_1 ; RV32IA-NEXT: .LBB223_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19693,36 +19693,36 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB224_2 ; RV32I-NEXT: .LBB224_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB224_7 ; RV32I-NEXT: .LBB224_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB224_4 +; RV32I-NEXT: beq a4, s1, .LBB224_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; 
RV32I-NEXT: j .LBB224_5 ; RV32I-NEXT: .LBB224_4: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB224_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB224_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 @@ -19730,8 +19730,8 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB224_1 ; RV32I-NEXT: .LBB224_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19747,36 +19747,36 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB224_2 ; RV32IA-NEXT: .LBB224_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB224_7 ; RV32IA-NEXT: .LBB224_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB224_4 +; RV32IA-NEXT: beq a4, s1, .LBB224_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB224_5 ; RV32IA-NEXT: .LBB224_4: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB224_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB224_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 @@ -19784,8 +19784,8 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB224_1 ; RV32IA-NEXT: .LBB224_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19851,36 +19851,36 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB225_2 ; RV32I-NEXT: .LBB225_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: 
Header=BB225_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB225_7 ; RV32I-NEXT: .LBB225_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB225_4 +; RV32I-NEXT: beq a4, s1, .LBB225_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB225_5 ; RV32I-NEXT: .LBB225_4: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB225_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB225_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 @@ -19888,8 +19888,8 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB225_1 ; RV32I-NEXT: .LBB225_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19905,36 +19905,36 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB225_2 ; RV32IA-NEXT: .LBB225_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB225_7 ; RV32IA-NEXT: .LBB225_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB225_4 +; RV32IA-NEXT: beq a4, s1, .LBB225_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB225_5 ; RV32IA-NEXT: .LBB225_4: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB225_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB225_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 @@ -19942,8 +19942,8 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB225_1 ; RV32IA-NEXT: .LBB225_7: # 
%atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20004,36 +20004,36 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB226_2 ; RV32I-NEXT: .LBB226_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB226_7 ; RV32I-NEXT: .LBB226_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB226_4 +; RV32I-NEXT: beq a4, s1, .LBB226_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB226_5 ; RV32I-NEXT: .LBB226_4: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB226_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB226_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 @@ -20041,8 +20041,8 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB226_1 ; RV32I-NEXT: .LBB226_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20058,36 +20058,36 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB226_2 ; RV32IA-NEXT: .LBB226_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB226_7 ; RV32IA-NEXT: .LBB226_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB226_4 +; RV32IA-NEXT: beq a4, s1, .LBB226_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # 
in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB226_5 ; RV32IA-NEXT: .LBB226_4: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB226_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB226_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 @@ -20095,8 +20095,8 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB226_1 ; RV32IA-NEXT: .LBB226_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20162,36 +20162,36 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB227_2 ; RV32I-NEXT: .LBB227_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB227_7 ; RV32I-NEXT: .LBB227_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB227_4 +; RV32I-NEXT: beq a4, s1, .LBB227_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB227_5 ; RV32I-NEXT: .LBB227_4: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB227_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB227_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 @@ -20199,8 +20199,8 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB227_1 ; RV32I-NEXT: .LBB227_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20216,36 +20216,36 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB227_2 ; 
RV32IA-NEXT: .LBB227_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB227_7 ; RV32IA-NEXT: .LBB227_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB227_4 +; RV32IA-NEXT: beq a4, s1, .LBB227_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB227_5 ; RV32IA-NEXT: .LBB227_4: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB227_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB227_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 @@ -20253,8 +20253,8 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB227_1 ; RV32IA-NEXT: .LBB227_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20320,36 +20320,36 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB228_2 ; RV32I-NEXT: .LBB228_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB228_7 ; RV32I-NEXT: .LBB228_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB228_4 +; RV32I-NEXT: beq a4, s1, .LBB228_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB228_5 ; RV32I-NEXT: .LBB228_4: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB228_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB228_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 @@ -20357,8 +20357,8 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv 
a3, s1 ; RV32I-NEXT: j .LBB228_1 ; RV32I-NEXT: .LBB228_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20374,36 +20374,36 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB228_2 ; RV32IA-NEXT: .LBB228_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB228_7 ; RV32IA-NEXT: .LBB228_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB228_4 +; RV32IA-NEXT: beq a4, s1, .LBB228_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB228_5 ; RV32IA-NEXT: .LBB228_4: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB228_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB228_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 @@ -20411,8 +20411,8 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB228_1 ; RV32IA-NEXT: .LBB228_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20478,36 +20478,36 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB229_2 ; RV32I-NEXT: .LBB229_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB229_7 ; RV32I-NEXT: .LBB229_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB229_4 +; RV32I-NEXT: 
beq a4, s1, .LBB229_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB229_5 ; RV32I-NEXT: .LBB229_4: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB229_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB229_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 @@ -20515,8 +20515,8 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB229_1 ; RV32I-NEXT: .LBB229_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20532,36 +20532,36 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB229_2 ; RV32IA-NEXT: .LBB229_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB229_7 ; RV32IA-NEXT: .LBB229_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB229_4 +; RV32IA-NEXT: beq a4, s1, .LBB229_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB229_5 ; RV32IA-NEXT: .LBB229_4: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB229_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB229_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 @@ -20569,8 +20569,8 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB229_1 ; RV32IA-NEXT: .LBB229_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20636,36 +20636,36 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; 
RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB230_2 ; RV32I-NEXT: .LBB230_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB230_7 ; RV32I-NEXT: .LBB230_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB230_4 +; RV32I-NEXT: beq a4, s1, .LBB230_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB230_5 ; RV32I-NEXT: .LBB230_4: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB230_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB230_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 @@ -20673,8 +20673,8 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB230_1 ; RV32I-NEXT: .LBB230_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20690,36 +20690,36 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB230_2 ; RV32IA-NEXT: .LBB230_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB230_7 ; RV32IA-NEXT: .LBB230_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB230_4 +; RV32IA-NEXT: beq a4, s1, .LBB230_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB230_5 ; RV32IA-NEXT: .LBB230_4: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB230_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB230_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 
Depth=1 @@ -20727,8 +20727,8 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB230_1 ; RV32IA-NEXT: .LBB230_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20789,36 +20789,36 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB231_2 ; RV32I-NEXT: .LBB231_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB231_7 ; RV32I-NEXT: .LBB231_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB231_4 +; RV32I-NEXT: beq a4, s1, .LBB231_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB231_5 ; RV32I-NEXT: .LBB231_4: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB231_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB231_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 @@ -20826,8 +20826,8 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB231_1 ; RV32I-NEXT: .LBB231_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20843,36 +20843,36 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB231_2 ; RV32IA-NEXT: .LBB231_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB231_7 ; RV32IA-NEXT: .LBB231_2: # %atomicrmw.start ; 
RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB231_4 +; RV32IA-NEXT: beq a4, s1, .LBB231_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB231_5 ; RV32IA-NEXT: .LBB231_4: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB231_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB231_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 @@ -20880,8 +20880,8 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB231_1 ; RV32IA-NEXT: .LBB231_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -20947,36 +20947,36 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB232_2 ; RV32I-NEXT: .LBB232_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB232_7 ; RV32I-NEXT: .LBB232_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB232_4 +; RV32I-NEXT: beq a4, s1, .LBB232_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB232_5 ; RV32I-NEXT: .LBB232_4: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB232_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB232_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 @@ -20984,8 +20984,8 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB232_1 ; RV32I-NEXT: .LBB232_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21001,36 +21001,36 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; 
RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB232_2 ; RV32IA-NEXT: .LBB232_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB232_7 ; RV32IA-NEXT: .LBB232_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB232_4 +; RV32IA-NEXT: beq a4, s1, .LBB232_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB232_5 ; RV32IA-NEXT: .LBB232_4: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB232_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB232_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 @@ -21038,8 +21038,8 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB232_1 ; RV32IA-NEXT: .LBB232_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21105,36 +21105,36 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB233_2 ; RV32I-NEXT: .LBB233_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB233_7 ; RV32I-NEXT: .LBB233_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB233_4 +; RV32I-NEXT: beq a4, s1, .LBB233_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB233_5 ; RV32I-NEXT: .LBB233_4: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB233_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB233_1 
; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 @@ -21142,8 +21142,8 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB233_1 ; RV32I-NEXT: .LBB233_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21159,36 +21159,36 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB233_2 ; RV32IA-NEXT: .LBB233_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB233_7 ; RV32IA-NEXT: .LBB233_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB233_4 +; RV32IA-NEXT: beq a4, s1, .LBB233_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB233_5 ; RV32IA-NEXT: .LBB233_4: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB233_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB233_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 @@ -21196,8 +21196,8 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB233_1 ; RV32IA-NEXT: .LBB233_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21263,36 +21263,36 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB234_2 ; RV32I-NEXT: .LBB234_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; 
RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB234_7 ; RV32I-NEXT: .LBB234_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB234_4 +; RV32I-NEXT: beq a4, s1, .LBB234_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB234_5 ; RV32I-NEXT: .LBB234_4: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB234_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB234_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 @@ -21300,8 +21300,8 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB234_1 ; RV32I-NEXT: .LBB234_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21317,36 +21317,36 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB234_2 ; RV32IA-NEXT: .LBB234_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB234_7 ; RV32IA-NEXT: .LBB234_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB234_4 +; RV32IA-NEXT: beq a4, s1, .LBB234_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB234_5 ; RV32IA-NEXT: .LBB234_4: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB234_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB234_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 @@ -21354,8 +21354,8 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB234_1 ; RV32IA-NEXT: .LBB234_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21421,36 +21421,36 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; 
RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB235_2 ; RV32I-NEXT: .LBB235_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB235_7 ; RV32I-NEXT: .LBB235_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB235_4 +; RV32I-NEXT: beq a4, s1, .LBB235_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB235_5 ; RV32I-NEXT: .LBB235_4: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB235_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB235_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 @@ -21458,8 +21458,8 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB235_1 ; RV32I-NEXT: .LBB235_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21475,36 +21475,36 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB235_2 ; RV32IA-NEXT: .LBB235_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB235_7 ; RV32IA-NEXT: .LBB235_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB235_4 +; RV32IA-NEXT: beq a4, s1, .LBB235_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB235_5 ; RV32IA-NEXT: .LBB235_4: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB235_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: mv 
a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB235_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 @@ -21512,8 +21512,8 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB235_1 ; RV32IA-NEXT: .LBB235_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21574,36 +21574,36 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB236_2 ; RV32I-NEXT: .LBB236_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB236_7 ; RV32I-NEXT: .LBB236_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB236_4 +; RV32I-NEXT: beq a4, s1, .LBB236_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB236_5 ; RV32I-NEXT: .LBB236_4: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB236_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB236_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 @@ -21611,8 +21611,8 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB236_1 ; RV32I-NEXT: .LBB236_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21628,36 +21628,36 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB236_2 ; RV32IA-NEXT: .LBB236_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call 
__atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB236_7 ; RV32IA-NEXT: .LBB236_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB236_4 +; RV32IA-NEXT: beq a4, s1, .LBB236_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB236_5 ; RV32IA-NEXT: .LBB236_4: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB236_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB236_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 @@ -21665,8 +21665,8 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB236_1 ; RV32IA-NEXT: .LBB236_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21732,36 +21732,36 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB237_2 ; RV32I-NEXT: .LBB237_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB237_7 ; RV32I-NEXT: .LBB237_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB237_4 +; RV32I-NEXT: beq a4, s1, .LBB237_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB237_5 ; RV32I-NEXT: .LBB237_4: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB237_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB237_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 @@ -21769,8 +21769,8 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB237_1 ; RV32I-NEXT: .LBB237_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload 
@@ -21786,36 +21786,36 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB237_2 ; RV32IA-NEXT: .LBB237_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB237_7 ; RV32IA-NEXT: .LBB237_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB237_4 +; RV32IA-NEXT: beq a4, s1, .LBB237_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB237_5 ; RV32IA-NEXT: .LBB237_4: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB237_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB237_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 @@ -21823,8 +21823,8 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB237_1 ; RV32IA-NEXT: .LBB237_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21890,36 +21890,36 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB238_2 ; RV32I-NEXT: .LBB238_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB238_7 ; RV32I-NEXT: .LBB238_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB238_4 +; RV32I-NEXT: beq a4, s1, .LBB238_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB238_5 ; RV32I-NEXT: .LBB238_4: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; 
RV32I-NEXT: .LBB238_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB238_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 @@ -21927,8 +21927,8 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB238_1 ; RV32I-NEXT: .LBB238_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -21944,36 +21944,36 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB238_2 ; RV32IA-NEXT: .LBB238_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB238_7 ; RV32IA-NEXT: .LBB238_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB238_4 +; RV32IA-NEXT: beq a4, s1, .LBB238_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB238_5 ; RV32IA-NEXT: .LBB238_4: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB238_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB238_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 @@ -21981,8 +21981,8 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB238_1 ; RV32IA-NEXT: .LBB238_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -22048,36 +22048,36 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB239_2 ; RV32I-NEXT: .LBB239_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi 
a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB239_7 ; RV32I-NEXT: .LBB239_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB239_4 +; RV32I-NEXT: beq a4, s1, .LBB239_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB239_5 ; RV32I-NEXT: .LBB239_4: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB239_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB239_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 @@ -22085,8 +22085,8 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB239_1 ; RV32I-NEXT: .LBB239_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -22102,36 +22102,36 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB239_2 ; RV32IA-NEXT: .LBB239_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB239_7 ; RV32IA-NEXT: .LBB239_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB239_4 +; RV32IA-NEXT: beq a4, s1, .LBB239_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB239_5 ; RV32IA-NEXT: .LBB239_4: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB239_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB239_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 @@ -22139,8 +22139,8 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB239_1 ; RV32IA-NEXT: .LBB239_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 
4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll index 2739fde250ee2..793ca7d08f513 100644 --- a/llvm/test/CodeGen/RISCV/atomic-signext.ll +++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll @@ -3137,36 +3137,36 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB43_2 ; RV32I-NEXT: .LBB43_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB43_7 ; RV32I-NEXT: .LBB43_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB43_4 +; RV32I-NEXT: beq a4, s1, .LBB43_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB43_5 ; RV32I-NEXT: .LBB43_4: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB43_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB43_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 @@ -3174,8 +3174,8 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB43_1 ; RV32I-NEXT: .LBB43_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3191,36 +3191,36 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB43_2 ; RV32IA-NEXT: .LBB43_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB43_7 ; RV32IA-NEXT: .LBB43_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB43_4 +; 
RV32IA-NEXT: beq a4, s1, .LBB43_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB43_5 ; RV32IA-NEXT: .LBB43_4: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB43_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB43_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 @@ -3228,8 +3228,8 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB43_1 ; RV32IA-NEXT: .LBB43_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3290,36 +3290,36 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB44_2 ; RV32I-NEXT: .LBB44_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB44_7 ; RV32I-NEXT: .LBB44_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB44_4 +; RV32I-NEXT: beq a4, s1, .LBB44_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: slt a0, s1, a5 +; RV32I-NEXT: slt a0, s1, a4 ; RV32I-NEXT: j .LBB44_5 ; RV32I-NEXT: .LBB44_4: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB44_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB44_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 @@ -3327,8 +3327,8 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB44_1 ; RV32I-NEXT: .LBB44_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3344,36 +3344,36 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw 
a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB44_2 ; RV32IA-NEXT: .LBB44_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB44_7 ; RV32IA-NEXT: .LBB44_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB44_4 +; RV32IA-NEXT: beq a4, s1, .LBB44_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: slt a0, s1, a5 +; RV32IA-NEXT: slt a0, s1, a4 ; RV32IA-NEXT: j .LBB44_5 ; RV32IA-NEXT: .LBB44_4: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB44_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB44_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 @@ -3381,8 +3381,8 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB44_1 ; RV32IA-NEXT: .LBB44_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3443,36 +3443,36 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB45_2 ; RV32I-NEXT: .LBB45_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB45_7 ; RV32I-NEXT: .LBB45_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB45_4 +; RV32I-NEXT: beq a4, s1, .LBB45_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB45_5 ; RV32I-NEXT: .LBB45_4: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB45_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: bnez a0, .LBB45_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 @@ -3480,8 +3480,8 @@ define 
signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB45_1 ; RV32I-NEXT: .LBB45_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3497,36 +3497,36 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB45_2 ; RV32IA-NEXT: .LBB45_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB45_7 ; RV32IA-NEXT: .LBB45_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB45_4 +; RV32IA-NEXT: beq a4, s1, .LBB45_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB45_5 ; RV32IA-NEXT: .LBB45_4: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB45_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: bnez a0, .LBB45_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 @@ -3534,8 +3534,8 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB45_1 ; RV32IA-NEXT: .LBB45_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3596,36 +3596,36 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB46_2 ; RV32I-NEXT: .LBB46_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB46_7 ; RV32I-NEXT: .LBB46_2: # %atomicrmw.start ; 
RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB46_4 +; RV32I-NEXT: beq a4, s1, .LBB46_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB46_5 ; RV32I-NEXT: .LBB46_4: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB46_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a2, a5 +; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: beqz a0, .LBB46_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 @@ -3633,8 +3633,8 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: mv a3, s1 ; RV32I-NEXT: j .LBB46_1 ; RV32I-NEXT: .LBB46_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3650,36 +3650,36 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB46_2 ; RV32IA-NEXT: .LBB46_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB46_7 ; RV32IA-NEXT: .LBB46_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB46_4 +; RV32IA-NEXT: beq a4, s1, .LBB46_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB46_5 ; RV32IA-NEXT: .LBB46_4: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB46_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 +; RV32IA-NEXT: mv a2, a5 +; RV32IA-NEXT: mv a3, a4 ; RV32IA-NEXT: beqz a0, .LBB46_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 @@ -3687,8 +3687,8 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: mv a3, s1 ; RV32IA-NEXT: j .LBB46_1 ; RV32IA-NEXT: .LBB46_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll index 5f15a9c067102..2fa610e7548b8 100644 --- 
a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll @@ -468,41 +468,41 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB3_3 ; RV32I-NEXT: .LBB3_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a5, s1 +; RV32I-NEXT: sltu a0, a4, s1 ; RV32I-NEXT: .LBB3_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: addi a1, a4, 1 +; RV32I-NEXT: addi a1, a5, 1 ; RV32I-NEXT: seqz a2, a1 -; RV32I-NEXT: add a3, a5, a2 +; RV32I-NEXT: add a3, a4, a2 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: and a2, a0, a1 ; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB3_5 ; RV32I-NEXT: .LBB3_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: bne a5, s1, .LBB3_1 +; RV32I-NEXT: bne a4, s1, .LBB3_1 ; RV32I-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a4, s2 +; RV32I-NEXT: sltu a0, a5, s2 ; RV32I-NEXT: j .LBB3_2 ; RV32I-NEXT: .LBB3_5: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -523,41 +523,41 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB3_3 ; RV32IA-NEXT: .LBB3_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a5, s1 +; RV32IA-NEXT: sltu a0, a4, s1 ; RV32IA-NEXT: .LBB3_2: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: addi a1, a4, 1 +; RV32IA-NEXT: addi a1, a5, 1 ; RV32IA-NEXT: seqz a2, a1 -; RV32IA-NEXT: add a3, a5, a2 +; RV32IA-NEXT: add a3, a4, a2 ; RV32IA-NEXT: neg a0, a0 ; RV32IA-NEXT: and a2, a0, a1 ; RV32IA-NEXT: and a3, a0, a3 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB3_5 ; RV32IA-NEXT: .LBB3_3: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: bne a5, s1, .LBB3_1 +; RV32IA-NEXT: bne a4, s1, .LBB3_1 ; RV32IA-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a4, s2 +; RV32IA-NEXT: sltu a0, a5, s2 ; RV32IA-NEXT: j .LBB3_2 ; RV32IA-NEXT: .LBB3_5: # 
%atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1211,35 +1211,35 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 0(a0) +; RV32I-NEXT: lw a4, 4(a0) ; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: j .LBB7_2 ; RV32I-NEXT: .LBB7_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_8@plt -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: lw a4, 8(sp) +; RV32I-NEXT: lw a5, 8(sp) +; RV32I-NEXT: lw a4, 12(sp) ; RV32I-NEXT: bnez a0, .LBB7_7 ; RV32I-NEXT: .LBB7_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s1, .LBB7_4 +; RV32I-NEXT: beq a4, s1, .LBB7_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: sltu a0, s1, a5 +; RV32I-NEXT: sltu a0, s1, a4 ; RV32I-NEXT: j .LBB7_5 ; RV32I-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a0, s2, a5 ; RV32I-NEXT: .LBB7_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: or a1, a4, a5 +; RV32I-NEXT: or a1, a5, a4 ; RV32I-NEXT: seqz a1, a1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: mv a2, s2 @@ -1247,13 +1247,13 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: bnez a0, .LBB7_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: seqz a0, a4 -; RV32I-NEXT: sub a3, a5, a0 -; RV32I-NEXT: addi a2, a4, -1 +; RV32I-NEXT: seqz a0, a5 +; RV32I-NEXT: sub a3, a4, a0 +; RV32I-NEXT: addi a2, a5, -1 ; RV32I-NEXT: j .LBB7_1 ; RV32I-NEXT: .LBB7_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a0, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1274,35 +1274,35 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 ; RV32IA-NEXT: mv s0, a0 -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: lw a4, 0(a0) +; RV32IA-NEXT: lw a5, 0(a0) +; RV32IA-NEXT: lw a4, 4(a0) ; RV32IA-NEXT: mv s1, a2 ; RV32IA-NEXT: mv s2, a1 ; RV32IA-NEXT: j .LBB7_2 ; RV32IA-NEXT: .LBB7_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a5, 8(sp) +; RV32IA-NEXT: sw a4, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 ; RV32IA-NEXT: mv a0, s0 ; RV32IA-NEXT: call __atomic_compare_exchange_8@plt -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: lw a4, 8(sp) +; RV32IA-NEXT: lw a5, 8(sp) +; RV32IA-NEXT: lw a4, 12(sp) ; RV32IA-NEXT: bnez a0, .LBB7_7 ; RV32IA-NEXT: .LBB7_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s1, .LBB7_4 +; RV32IA-NEXT: beq a4, s1, .LBB7_4 ; 
RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: sltu a0, s1, a5 +; RV32IA-NEXT: sltu a0, s1, a4 ; RV32IA-NEXT: j .LBB7_5 ; RV32IA-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a0, s2, a5 ; RV32IA-NEXT: .LBB7_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: or a1, a4, a5 +; RV32IA-NEXT: or a1, a5, a4 ; RV32IA-NEXT: seqz a1, a1 ; RV32IA-NEXT: or a0, a1, a0 ; RV32IA-NEXT: mv a2, s2 @@ -1310,13 +1310,13 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: bnez a0, .LBB7_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: seqz a0, a4 -; RV32IA-NEXT: sub a3, a5, a0 -; RV32IA-NEXT: addi a2, a4, -1 +; RV32IA-NEXT: seqz a0, a5 +; RV32IA-NEXT: sub a3, a4, a0 +; RV32IA-NEXT: addi a2, a5, -1 ; RV32IA-NEXT: j .LBB7_1 ; RV32IA-NEXT: .LBB7_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 +; RV32IA-NEXT: mv a0, a5 +; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/byval.ll b/llvm/test/CodeGen/RISCV/byval.ll index d300542e08075..368acaa1ca264 100644 --- a/llvm/test/CodeGen/RISCV/byval.ll +++ b/llvm/test/CodeGen/RISCV/byval.ll @@ -26,10 +26,10 @@ define void @caller() nounwind { ; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a0, a0, %lo(foo) ; RV32I-NEXT: lw a1, 12(a0) -; RV32I-NEXT: sw a1, 24(sp) -; RV32I-NEXT: lw a1, 8(a0) -; RV32I-NEXT: sw a1, 20(sp) +; RV32I-NEXT: lw a2, 8(a0) ; RV32I-NEXT: lw a0, 4(a0) +; RV32I-NEXT: sw a1, 24(sp) +; RV32I-NEXT: sw a2, 20(sp) ; RV32I-NEXT: sw a0, 16(sp) ; RV32I-NEXT: addi a0, sp, 12 ; RV32I-NEXT: call callee@plt diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll index 7111316931f19..7aeaaab68a208 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll @@ -25,69 +25,69 @@ define void @callee() nounwind { ; ILP32: # %bb.0: ; ILP32-NEXT: lui a0, %hi(var) ; ILP32-NEXT: flw fa5, %lo(var)(a0) -; ILP32-NEXT: flw fa4, %lo(var+4)(a0) -; ILP32-NEXT: flw fa3, %lo(var+8)(a0) -; ILP32-NEXT: flw fa2, %lo(var+12)(a0) ; ILP32-NEXT: addi a1, a0, %lo(var) -; ILP32-NEXT: flw fa1, 16(a1) -; ILP32-NEXT: flw fa0, 20(a1) -; ILP32-NEXT: flw ft0, 24(a1) -; ILP32-NEXT: flw ft1, 28(a1) -; ILP32-NEXT: flw ft2, 32(a1) -; ILP32-NEXT: flw ft3, 36(a1) -; ILP32-NEXT: flw ft4, 40(a1) -; ILP32-NEXT: flw ft5, 44(a1) -; ILP32-NEXT: flw ft6, 48(a1) -; ILP32-NEXT: flw ft7, 52(a1) -; ILP32-NEXT: flw fa6, 56(a1) -; ILP32-NEXT: flw fa7, 60(a1) -; ILP32-NEXT: flw ft8, 64(a1) -; ILP32-NEXT: flw ft9, 68(a1) -; ILP32-NEXT: flw ft10, 72(a1) -; ILP32-NEXT: flw ft11, 76(a1) -; ILP32-NEXT: flw fs0, 80(a1) -; ILP32-NEXT: flw fs1, 84(a1) -; ILP32-NEXT: flw fs2, 88(a1) -; ILP32-NEXT: flw fs3, 92(a1) -; ILP32-NEXT: flw fs4, 96(a1) -; ILP32-NEXT: flw fs5, 100(a1) -; ILP32-NEXT: flw fs6, 104(a1) -; ILP32-NEXT: flw fs7, 108(a1) +; ILP32-NEXT: flw fa4, 16(a1) +; ILP32-NEXT: flw fa3, 20(a1) +; ILP32-NEXT: flw fa2, 24(a1) +; ILP32-NEXT: flw fa1, 28(a1) +; ILP32-NEXT: flw fa0, 32(a1) +; ILP32-NEXT: flw ft0, 36(a1) +; ILP32-NEXT: flw ft1, 40(a1) +; ILP32-NEXT: flw ft2, 44(a1) +; ILP32-NEXT: flw ft3, 48(a1) +; ILP32-NEXT: flw ft4, 52(a1) +; ILP32-NEXT: flw ft5, 56(a1) +; ILP32-NEXT: flw ft6, 60(a1) +; 
ILP32-NEXT: flw ft7, 64(a1) +; ILP32-NEXT: flw fa6, 68(a1) +; ILP32-NEXT: flw fa7, 72(a1) +; ILP32-NEXT: flw ft8, 76(a1) +; ILP32-NEXT: flw ft9, 80(a1) +; ILP32-NEXT: flw ft10, 84(a1) +; ILP32-NEXT: flw ft11, 88(a1) +; ILP32-NEXT: flw fs0, 92(a1) +; ILP32-NEXT: flw fs1, 96(a1) +; ILP32-NEXT: flw fs2, 100(a1) +; ILP32-NEXT: flw fs3, 104(a1) +; ILP32-NEXT: flw fs4, 108(a1) +; ILP32-NEXT: flw fs5, 112(a1) +; ILP32-NEXT: flw fs6, 116(a1) +; ILP32-NEXT: flw fs7, 120(a1) ; ILP32-NEXT: flw fs8, 124(a1) -; ILP32-NEXT: flw fs9, 120(a1) -; ILP32-NEXT: flw fs10, 116(a1) -; ILP32-NEXT: flw fs11, 112(a1) +; ILP32-NEXT: flw fs9, %lo(var+4)(a0) +; ILP32-NEXT: flw fs10, %lo(var+8)(a0) +; ILP32-NEXT: flw fs11, %lo(var+12)(a0) ; ILP32-NEXT: fsw fs8, 124(a1) -; ILP32-NEXT: fsw fs9, 120(a1) -; ILP32-NEXT: fsw fs10, 116(a1) -; ILP32-NEXT: fsw fs11, 112(a1) -; ILP32-NEXT: fsw fs7, 108(a1) -; ILP32-NEXT: fsw fs6, 104(a1) -; ILP32-NEXT: fsw fs5, 100(a1) -; ILP32-NEXT: fsw fs4, 96(a1) -; ILP32-NEXT: fsw fs3, 92(a1) -; ILP32-NEXT: fsw fs2, 88(a1) -; ILP32-NEXT: fsw fs1, 84(a1) -; ILP32-NEXT: fsw fs0, 80(a1) -; ILP32-NEXT: fsw ft11, 76(a1) -; ILP32-NEXT: fsw ft10, 72(a1) -; ILP32-NEXT: fsw ft9, 68(a1) -; ILP32-NEXT: fsw ft8, 64(a1) -; ILP32-NEXT: fsw fa7, 60(a1) -; ILP32-NEXT: fsw fa6, 56(a1) -; ILP32-NEXT: fsw ft7, 52(a1) -; ILP32-NEXT: fsw ft6, 48(a1) -; ILP32-NEXT: fsw ft5, 44(a1) -; ILP32-NEXT: fsw ft4, 40(a1) -; ILP32-NEXT: fsw ft3, 36(a1) -; ILP32-NEXT: fsw ft2, 32(a1) -; ILP32-NEXT: fsw ft1, 28(a1) -; ILP32-NEXT: fsw ft0, 24(a1) -; ILP32-NEXT: fsw fa0, 20(a1) -; ILP32-NEXT: fsw fa1, 16(a1) -; ILP32-NEXT: fsw fa2, %lo(var+12)(a0) -; ILP32-NEXT: fsw fa3, %lo(var+8)(a0) -; ILP32-NEXT: fsw fa4, %lo(var+4)(a0) +; ILP32-NEXT: fsw fs7, 120(a1) +; ILP32-NEXT: fsw fs6, 116(a1) +; ILP32-NEXT: fsw fs5, 112(a1) +; ILP32-NEXT: fsw fs4, 108(a1) +; ILP32-NEXT: fsw fs3, 104(a1) +; ILP32-NEXT: fsw fs2, 100(a1) +; ILP32-NEXT: fsw fs1, 96(a1) +; ILP32-NEXT: fsw fs0, 92(a1) +; ILP32-NEXT: fsw ft11, 88(a1) +; ILP32-NEXT: fsw ft10, 84(a1) +; ILP32-NEXT: fsw ft9, 80(a1) +; ILP32-NEXT: fsw ft8, 76(a1) +; ILP32-NEXT: fsw fa7, 72(a1) +; ILP32-NEXT: fsw fa6, 68(a1) +; ILP32-NEXT: fsw ft7, 64(a1) +; ILP32-NEXT: fsw ft6, 60(a1) +; ILP32-NEXT: fsw ft5, 56(a1) +; ILP32-NEXT: fsw ft4, 52(a1) +; ILP32-NEXT: fsw ft3, 48(a1) +; ILP32-NEXT: fsw ft2, 44(a1) +; ILP32-NEXT: fsw ft1, 40(a1) +; ILP32-NEXT: fsw ft0, 36(a1) +; ILP32-NEXT: fsw fa0, 32(a1) +; ILP32-NEXT: fsw fa1, 28(a1) +; ILP32-NEXT: fsw fa2, 24(a1) +; ILP32-NEXT: fsw fa3, 20(a1) +; ILP32-NEXT: fsw fa4, 16(a1) +; ILP32-NEXT: fsw fs11, %lo(var+12)(a0) +; ILP32-NEXT: fsw fs10, %lo(var+8)(a0) +; ILP32-NEXT: fsw fs9, %lo(var+4)(a0) ; ILP32-NEXT: fsw fa5, %lo(var)(a0) ; ILP32-NEXT: ret ; @@ -95,69 +95,69 @@ define void @callee() nounwind { ; LP64: # %bb.0: ; LP64-NEXT: lui a0, %hi(var) ; LP64-NEXT: flw fa5, %lo(var)(a0) -; LP64-NEXT: flw fa4, %lo(var+4)(a0) -; LP64-NEXT: flw fa3, %lo(var+8)(a0) -; LP64-NEXT: flw fa2, %lo(var+12)(a0) ; LP64-NEXT: addi a1, a0, %lo(var) -; LP64-NEXT: flw fa1, 16(a1) -; LP64-NEXT: flw fa0, 20(a1) -; LP64-NEXT: flw ft0, 24(a1) -; LP64-NEXT: flw ft1, 28(a1) -; LP64-NEXT: flw ft2, 32(a1) -; LP64-NEXT: flw ft3, 36(a1) -; LP64-NEXT: flw ft4, 40(a1) -; LP64-NEXT: flw ft5, 44(a1) -; LP64-NEXT: flw ft6, 48(a1) -; LP64-NEXT: flw ft7, 52(a1) -; LP64-NEXT: flw fa6, 56(a1) -; LP64-NEXT: flw fa7, 60(a1) -; LP64-NEXT: flw ft8, 64(a1) -; LP64-NEXT: flw ft9, 68(a1) -; LP64-NEXT: flw ft10, 72(a1) -; LP64-NEXT: flw ft11, 76(a1) -; LP64-NEXT: flw fs0, 80(a1) -; LP64-NEXT: flw 
fs1, 84(a1) -; LP64-NEXT: flw fs2, 88(a1) -; LP64-NEXT: flw fs3, 92(a1) -; LP64-NEXT: flw fs4, 96(a1) -; LP64-NEXT: flw fs5, 100(a1) -; LP64-NEXT: flw fs6, 104(a1) -; LP64-NEXT: flw fs7, 108(a1) +; LP64-NEXT: flw fa4, 16(a1) +; LP64-NEXT: flw fa3, 20(a1) +; LP64-NEXT: flw fa2, 24(a1) +; LP64-NEXT: flw fa1, 28(a1) +; LP64-NEXT: flw fa0, 32(a1) +; LP64-NEXT: flw ft0, 36(a1) +; LP64-NEXT: flw ft1, 40(a1) +; LP64-NEXT: flw ft2, 44(a1) +; LP64-NEXT: flw ft3, 48(a1) +; LP64-NEXT: flw ft4, 52(a1) +; LP64-NEXT: flw ft5, 56(a1) +; LP64-NEXT: flw ft6, 60(a1) +; LP64-NEXT: flw ft7, 64(a1) +; LP64-NEXT: flw fa6, 68(a1) +; LP64-NEXT: flw fa7, 72(a1) +; LP64-NEXT: flw ft8, 76(a1) +; LP64-NEXT: flw ft9, 80(a1) +; LP64-NEXT: flw ft10, 84(a1) +; LP64-NEXT: flw ft11, 88(a1) +; LP64-NEXT: flw fs0, 92(a1) +; LP64-NEXT: flw fs1, 96(a1) +; LP64-NEXT: flw fs2, 100(a1) +; LP64-NEXT: flw fs3, 104(a1) +; LP64-NEXT: flw fs4, 108(a1) +; LP64-NEXT: flw fs5, 112(a1) +; LP64-NEXT: flw fs6, 116(a1) +; LP64-NEXT: flw fs7, 120(a1) ; LP64-NEXT: flw fs8, 124(a1) -; LP64-NEXT: flw fs9, 120(a1) -; LP64-NEXT: flw fs10, 116(a1) -; LP64-NEXT: flw fs11, 112(a1) +; LP64-NEXT: flw fs9, %lo(var+4)(a0) +; LP64-NEXT: flw fs10, %lo(var+8)(a0) +; LP64-NEXT: flw fs11, %lo(var+12)(a0) ; LP64-NEXT: fsw fs8, 124(a1) -; LP64-NEXT: fsw fs9, 120(a1) -; LP64-NEXT: fsw fs10, 116(a1) -; LP64-NEXT: fsw fs11, 112(a1) -; LP64-NEXT: fsw fs7, 108(a1) -; LP64-NEXT: fsw fs6, 104(a1) -; LP64-NEXT: fsw fs5, 100(a1) -; LP64-NEXT: fsw fs4, 96(a1) -; LP64-NEXT: fsw fs3, 92(a1) -; LP64-NEXT: fsw fs2, 88(a1) -; LP64-NEXT: fsw fs1, 84(a1) -; LP64-NEXT: fsw fs0, 80(a1) -; LP64-NEXT: fsw ft11, 76(a1) -; LP64-NEXT: fsw ft10, 72(a1) -; LP64-NEXT: fsw ft9, 68(a1) -; LP64-NEXT: fsw ft8, 64(a1) -; LP64-NEXT: fsw fa7, 60(a1) -; LP64-NEXT: fsw fa6, 56(a1) -; LP64-NEXT: fsw ft7, 52(a1) -; LP64-NEXT: fsw ft6, 48(a1) -; LP64-NEXT: fsw ft5, 44(a1) -; LP64-NEXT: fsw ft4, 40(a1) -; LP64-NEXT: fsw ft3, 36(a1) -; LP64-NEXT: fsw ft2, 32(a1) -; LP64-NEXT: fsw ft1, 28(a1) -; LP64-NEXT: fsw ft0, 24(a1) -; LP64-NEXT: fsw fa0, 20(a1) -; LP64-NEXT: fsw fa1, 16(a1) -; LP64-NEXT: fsw fa2, %lo(var+12)(a0) -; LP64-NEXT: fsw fa3, %lo(var+8)(a0) -; LP64-NEXT: fsw fa4, %lo(var+4)(a0) +; LP64-NEXT: fsw fs7, 120(a1) +; LP64-NEXT: fsw fs6, 116(a1) +; LP64-NEXT: fsw fs5, 112(a1) +; LP64-NEXT: fsw fs4, 108(a1) +; LP64-NEXT: fsw fs3, 104(a1) +; LP64-NEXT: fsw fs2, 100(a1) +; LP64-NEXT: fsw fs1, 96(a1) +; LP64-NEXT: fsw fs0, 92(a1) +; LP64-NEXT: fsw ft11, 88(a1) +; LP64-NEXT: fsw ft10, 84(a1) +; LP64-NEXT: fsw ft9, 80(a1) +; LP64-NEXT: fsw ft8, 76(a1) +; LP64-NEXT: fsw fa7, 72(a1) +; LP64-NEXT: fsw fa6, 68(a1) +; LP64-NEXT: fsw ft7, 64(a1) +; LP64-NEXT: fsw ft6, 60(a1) +; LP64-NEXT: fsw ft5, 56(a1) +; LP64-NEXT: fsw ft4, 52(a1) +; LP64-NEXT: fsw ft3, 48(a1) +; LP64-NEXT: fsw ft2, 44(a1) +; LP64-NEXT: fsw ft1, 40(a1) +; LP64-NEXT: fsw ft0, 36(a1) +; LP64-NEXT: fsw fa0, 32(a1) +; LP64-NEXT: fsw fa1, 28(a1) +; LP64-NEXT: fsw fa2, 24(a1) +; LP64-NEXT: fsw fa3, 20(a1) +; LP64-NEXT: fsw fa4, 16(a1) +; LP64-NEXT: fsw fs11, %lo(var+12)(a0) +; LP64-NEXT: fsw fs10, %lo(var+8)(a0) +; LP64-NEXT: fsw fs9, %lo(var+4)(a0) ; LP64-NEXT: fsw fa5, %lo(var)(a0) ; LP64-NEXT: ret ; @@ -178,69 +178,69 @@ define void @callee() nounwind { ; ILP32F-NEXT: fsw fs11, 0(sp) # 4-byte Folded Spill ; ILP32F-NEXT: lui a0, %hi(var) ; ILP32F-NEXT: flw fa5, %lo(var)(a0) -; ILP32F-NEXT: flw fa4, %lo(var+4)(a0) -; ILP32F-NEXT: flw fa3, %lo(var+8)(a0) -; ILP32F-NEXT: flw fa2, %lo(var+12)(a0) ; ILP32F-NEXT: addi a1, a0, %lo(var) -; 
ILP32F-NEXT: flw fa1, 16(a1) -; ILP32F-NEXT: flw fa0, 20(a1) -; ILP32F-NEXT: flw ft0, 24(a1) -; ILP32F-NEXT: flw ft1, 28(a1) -; ILP32F-NEXT: flw ft2, 32(a1) -; ILP32F-NEXT: flw ft3, 36(a1) -; ILP32F-NEXT: flw ft4, 40(a1) -; ILP32F-NEXT: flw ft5, 44(a1) -; ILP32F-NEXT: flw ft6, 48(a1) -; ILP32F-NEXT: flw ft7, 52(a1) -; ILP32F-NEXT: flw fa6, 56(a1) -; ILP32F-NEXT: flw fa7, 60(a1) -; ILP32F-NEXT: flw ft8, 64(a1) -; ILP32F-NEXT: flw ft9, 68(a1) -; ILP32F-NEXT: flw ft10, 72(a1) -; ILP32F-NEXT: flw ft11, 76(a1) -; ILP32F-NEXT: flw fs0, 80(a1) -; ILP32F-NEXT: flw fs1, 84(a1) -; ILP32F-NEXT: flw fs2, 88(a1) -; ILP32F-NEXT: flw fs3, 92(a1) -; ILP32F-NEXT: flw fs4, 96(a1) -; ILP32F-NEXT: flw fs5, 100(a1) -; ILP32F-NEXT: flw fs6, 104(a1) -; ILP32F-NEXT: flw fs7, 108(a1) +; ILP32F-NEXT: flw fa4, 16(a1) +; ILP32F-NEXT: flw fa3, 20(a1) +; ILP32F-NEXT: flw fa2, 24(a1) +; ILP32F-NEXT: flw fa1, 28(a1) +; ILP32F-NEXT: flw fa0, 32(a1) +; ILP32F-NEXT: flw ft0, 36(a1) +; ILP32F-NEXT: flw ft1, 40(a1) +; ILP32F-NEXT: flw ft2, 44(a1) +; ILP32F-NEXT: flw ft3, 48(a1) +; ILP32F-NEXT: flw ft4, 52(a1) +; ILP32F-NEXT: flw ft5, 56(a1) +; ILP32F-NEXT: flw ft6, 60(a1) +; ILP32F-NEXT: flw ft7, 64(a1) +; ILP32F-NEXT: flw fa6, 68(a1) +; ILP32F-NEXT: flw fa7, 72(a1) +; ILP32F-NEXT: flw ft8, 76(a1) +; ILP32F-NEXT: flw ft9, 80(a1) +; ILP32F-NEXT: flw ft10, 84(a1) +; ILP32F-NEXT: flw ft11, 88(a1) +; ILP32F-NEXT: flw fs0, 92(a1) +; ILP32F-NEXT: flw fs1, 96(a1) +; ILP32F-NEXT: flw fs2, 100(a1) +; ILP32F-NEXT: flw fs3, 104(a1) +; ILP32F-NEXT: flw fs4, 108(a1) +; ILP32F-NEXT: flw fs5, 112(a1) +; ILP32F-NEXT: flw fs6, 116(a1) +; ILP32F-NEXT: flw fs7, 120(a1) ; ILP32F-NEXT: flw fs8, 124(a1) -; ILP32F-NEXT: flw fs9, 120(a1) -; ILP32F-NEXT: flw fs10, 116(a1) -; ILP32F-NEXT: flw fs11, 112(a1) +; ILP32F-NEXT: flw fs9, %lo(var+4)(a0) +; ILP32F-NEXT: flw fs10, %lo(var+8)(a0) +; ILP32F-NEXT: flw fs11, %lo(var+12)(a0) ; ILP32F-NEXT: fsw fs8, 124(a1) -; ILP32F-NEXT: fsw fs9, 120(a1) -; ILP32F-NEXT: fsw fs10, 116(a1) -; ILP32F-NEXT: fsw fs11, 112(a1) -; ILP32F-NEXT: fsw fs7, 108(a1) -; ILP32F-NEXT: fsw fs6, 104(a1) -; ILP32F-NEXT: fsw fs5, 100(a1) -; ILP32F-NEXT: fsw fs4, 96(a1) -; ILP32F-NEXT: fsw fs3, 92(a1) -; ILP32F-NEXT: fsw fs2, 88(a1) -; ILP32F-NEXT: fsw fs1, 84(a1) -; ILP32F-NEXT: fsw fs0, 80(a1) -; ILP32F-NEXT: fsw ft11, 76(a1) -; ILP32F-NEXT: fsw ft10, 72(a1) -; ILP32F-NEXT: fsw ft9, 68(a1) -; ILP32F-NEXT: fsw ft8, 64(a1) -; ILP32F-NEXT: fsw fa7, 60(a1) -; ILP32F-NEXT: fsw fa6, 56(a1) -; ILP32F-NEXT: fsw ft7, 52(a1) -; ILP32F-NEXT: fsw ft6, 48(a1) -; ILP32F-NEXT: fsw ft5, 44(a1) -; ILP32F-NEXT: fsw ft4, 40(a1) -; ILP32F-NEXT: fsw ft3, 36(a1) -; ILP32F-NEXT: fsw ft2, 32(a1) -; ILP32F-NEXT: fsw ft1, 28(a1) -; ILP32F-NEXT: fsw ft0, 24(a1) -; ILP32F-NEXT: fsw fa0, 20(a1) -; ILP32F-NEXT: fsw fa1, 16(a1) -; ILP32F-NEXT: fsw fa2, %lo(var+12)(a0) -; ILP32F-NEXT: fsw fa3, %lo(var+8)(a0) -; ILP32F-NEXT: fsw fa4, %lo(var+4)(a0) +; ILP32F-NEXT: fsw fs7, 120(a1) +; ILP32F-NEXT: fsw fs6, 116(a1) +; ILP32F-NEXT: fsw fs5, 112(a1) +; ILP32F-NEXT: fsw fs4, 108(a1) +; ILP32F-NEXT: fsw fs3, 104(a1) +; ILP32F-NEXT: fsw fs2, 100(a1) +; ILP32F-NEXT: fsw fs1, 96(a1) +; ILP32F-NEXT: fsw fs0, 92(a1) +; ILP32F-NEXT: fsw ft11, 88(a1) +; ILP32F-NEXT: fsw ft10, 84(a1) +; ILP32F-NEXT: fsw ft9, 80(a1) +; ILP32F-NEXT: fsw ft8, 76(a1) +; ILP32F-NEXT: fsw fa7, 72(a1) +; ILP32F-NEXT: fsw fa6, 68(a1) +; ILP32F-NEXT: fsw ft7, 64(a1) +; ILP32F-NEXT: fsw ft6, 60(a1) +; ILP32F-NEXT: fsw ft5, 56(a1) +; ILP32F-NEXT: fsw ft4, 52(a1) +; ILP32F-NEXT: fsw ft3, 48(a1) +; 
ILP32F-NEXT: fsw ft2, 44(a1) +; ILP32F-NEXT: fsw ft1, 40(a1) +; ILP32F-NEXT: fsw ft0, 36(a1) +; ILP32F-NEXT: fsw fa0, 32(a1) +; ILP32F-NEXT: fsw fa1, 28(a1) +; ILP32F-NEXT: fsw fa2, 24(a1) +; ILP32F-NEXT: fsw fa3, 20(a1) +; ILP32F-NEXT: fsw fa4, 16(a1) +; ILP32F-NEXT: fsw fs11, %lo(var+12)(a0) +; ILP32F-NEXT: fsw fs10, %lo(var+8)(a0) +; ILP32F-NEXT: fsw fs9, %lo(var+4)(a0) ; ILP32F-NEXT: fsw fa5, %lo(var)(a0) ; ILP32F-NEXT: flw fs0, 44(sp) # 4-byte Folded Reload ; ILP32F-NEXT: flw fs1, 40(sp) # 4-byte Folded Reload @@ -274,69 +274,69 @@ define void @callee() nounwind { ; LP64F-NEXT: fsw fs11, 0(sp) # 4-byte Folded Spill ; LP64F-NEXT: lui a0, %hi(var) ; LP64F-NEXT: flw fa5, %lo(var)(a0) -; LP64F-NEXT: flw fa4, %lo(var+4)(a0) -; LP64F-NEXT: flw fa3, %lo(var+8)(a0) -; LP64F-NEXT: flw fa2, %lo(var+12)(a0) ; LP64F-NEXT: addi a1, a0, %lo(var) -; LP64F-NEXT: flw fa1, 16(a1) -; LP64F-NEXT: flw fa0, 20(a1) -; LP64F-NEXT: flw ft0, 24(a1) -; LP64F-NEXT: flw ft1, 28(a1) -; LP64F-NEXT: flw ft2, 32(a1) -; LP64F-NEXT: flw ft3, 36(a1) -; LP64F-NEXT: flw ft4, 40(a1) -; LP64F-NEXT: flw ft5, 44(a1) -; LP64F-NEXT: flw ft6, 48(a1) -; LP64F-NEXT: flw ft7, 52(a1) -; LP64F-NEXT: flw fa6, 56(a1) -; LP64F-NEXT: flw fa7, 60(a1) -; LP64F-NEXT: flw ft8, 64(a1) -; LP64F-NEXT: flw ft9, 68(a1) -; LP64F-NEXT: flw ft10, 72(a1) -; LP64F-NEXT: flw ft11, 76(a1) -; LP64F-NEXT: flw fs0, 80(a1) -; LP64F-NEXT: flw fs1, 84(a1) -; LP64F-NEXT: flw fs2, 88(a1) -; LP64F-NEXT: flw fs3, 92(a1) -; LP64F-NEXT: flw fs4, 96(a1) -; LP64F-NEXT: flw fs5, 100(a1) -; LP64F-NEXT: flw fs6, 104(a1) -; LP64F-NEXT: flw fs7, 108(a1) +; LP64F-NEXT: flw fa4, 16(a1) +; LP64F-NEXT: flw fa3, 20(a1) +; LP64F-NEXT: flw fa2, 24(a1) +; LP64F-NEXT: flw fa1, 28(a1) +; LP64F-NEXT: flw fa0, 32(a1) +; LP64F-NEXT: flw ft0, 36(a1) +; LP64F-NEXT: flw ft1, 40(a1) +; LP64F-NEXT: flw ft2, 44(a1) +; LP64F-NEXT: flw ft3, 48(a1) +; LP64F-NEXT: flw ft4, 52(a1) +; LP64F-NEXT: flw ft5, 56(a1) +; LP64F-NEXT: flw ft6, 60(a1) +; LP64F-NEXT: flw ft7, 64(a1) +; LP64F-NEXT: flw fa6, 68(a1) +; LP64F-NEXT: flw fa7, 72(a1) +; LP64F-NEXT: flw ft8, 76(a1) +; LP64F-NEXT: flw ft9, 80(a1) +; LP64F-NEXT: flw ft10, 84(a1) +; LP64F-NEXT: flw ft11, 88(a1) +; LP64F-NEXT: flw fs0, 92(a1) +; LP64F-NEXT: flw fs1, 96(a1) +; LP64F-NEXT: flw fs2, 100(a1) +; LP64F-NEXT: flw fs3, 104(a1) +; LP64F-NEXT: flw fs4, 108(a1) +; LP64F-NEXT: flw fs5, 112(a1) +; LP64F-NEXT: flw fs6, 116(a1) +; LP64F-NEXT: flw fs7, 120(a1) ; LP64F-NEXT: flw fs8, 124(a1) -; LP64F-NEXT: flw fs9, 120(a1) -; LP64F-NEXT: flw fs10, 116(a1) -; LP64F-NEXT: flw fs11, 112(a1) +; LP64F-NEXT: flw fs9, %lo(var+4)(a0) +; LP64F-NEXT: flw fs10, %lo(var+8)(a0) +; LP64F-NEXT: flw fs11, %lo(var+12)(a0) ; LP64F-NEXT: fsw fs8, 124(a1) -; LP64F-NEXT: fsw fs9, 120(a1) -; LP64F-NEXT: fsw fs10, 116(a1) -; LP64F-NEXT: fsw fs11, 112(a1) -; LP64F-NEXT: fsw fs7, 108(a1) -; LP64F-NEXT: fsw fs6, 104(a1) -; LP64F-NEXT: fsw fs5, 100(a1) -; LP64F-NEXT: fsw fs4, 96(a1) -; LP64F-NEXT: fsw fs3, 92(a1) -; LP64F-NEXT: fsw fs2, 88(a1) -; LP64F-NEXT: fsw fs1, 84(a1) -; LP64F-NEXT: fsw fs0, 80(a1) -; LP64F-NEXT: fsw ft11, 76(a1) -; LP64F-NEXT: fsw ft10, 72(a1) -; LP64F-NEXT: fsw ft9, 68(a1) -; LP64F-NEXT: fsw ft8, 64(a1) -; LP64F-NEXT: fsw fa7, 60(a1) -; LP64F-NEXT: fsw fa6, 56(a1) -; LP64F-NEXT: fsw ft7, 52(a1) -; LP64F-NEXT: fsw ft6, 48(a1) -; LP64F-NEXT: fsw ft5, 44(a1) -; LP64F-NEXT: fsw ft4, 40(a1) -; LP64F-NEXT: fsw ft3, 36(a1) -; LP64F-NEXT: fsw ft2, 32(a1) -; LP64F-NEXT: fsw ft1, 28(a1) -; LP64F-NEXT: fsw ft0, 24(a1) -; LP64F-NEXT: fsw fa0, 20(a1) -; LP64F-NEXT: 
fsw fa1, 16(a1) -; LP64F-NEXT: fsw fa2, %lo(var+12)(a0) -; LP64F-NEXT: fsw fa3, %lo(var+8)(a0) -; LP64F-NEXT: fsw fa4, %lo(var+4)(a0) +; LP64F-NEXT: fsw fs7, 120(a1) +; LP64F-NEXT: fsw fs6, 116(a1) +; LP64F-NEXT: fsw fs5, 112(a1) +; LP64F-NEXT: fsw fs4, 108(a1) +; LP64F-NEXT: fsw fs3, 104(a1) +; LP64F-NEXT: fsw fs2, 100(a1) +; LP64F-NEXT: fsw fs1, 96(a1) +; LP64F-NEXT: fsw fs0, 92(a1) +; LP64F-NEXT: fsw ft11, 88(a1) +; LP64F-NEXT: fsw ft10, 84(a1) +; LP64F-NEXT: fsw ft9, 80(a1) +; LP64F-NEXT: fsw ft8, 76(a1) +; LP64F-NEXT: fsw fa7, 72(a1) +; LP64F-NEXT: fsw fa6, 68(a1) +; LP64F-NEXT: fsw ft7, 64(a1) +; LP64F-NEXT: fsw ft6, 60(a1) +; LP64F-NEXT: fsw ft5, 56(a1) +; LP64F-NEXT: fsw ft4, 52(a1) +; LP64F-NEXT: fsw ft3, 48(a1) +; LP64F-NEXT: fsw ft2, 44(a1) +; LP64F-NEXT: fsw ft1, 40(a1) +; LP64F-NEXT: fsw ft0, 36(a1) +; LP64F-NEXT: fsw fa0, 32(a1) +; LP64F-NEXT: fsw fa1, 28(a1) +; LP64F-NEXT: fsw fa2, 24(a1) +; LP64F-NEXT: fsw fa3, 20(a1) +; LP64F-NEXT: fsw fa4, 16(a1) +; LP64F-NEXT: fsw fs11, %lo(var+12)(a0) +; LP64F-NEXT: fsw fs10, %lo(var+8)(a0) +; LP64F-NEXT: fsw fs9, %lo(var+4)(a0) ; LP64F-NEXT: fsw fa5, %lo(var)(a0) ; LP64F-NEXT: flw fs0, 44(sp) # 4-byte Folded Reload ; LP64F-NEXT: flw fs1, 40(sp) # 4-byte Folded Reload @@ -370,69 +370,69 @@ define void @callee() nounwind { ; ILP32D-NEXT: fsd fs11, 0(sp) # 8-byte Folded Spill ; ILP32D-NEXT: lui a0, %hi(var) ; ILP32D-NEXT: flw fa5, %lo(var)(a0) -; ILP32D-NEXT: flw fa4, %lo(var+4)(a0) -; ILP32D-NEXT: flw fa3, %lo(var+8)(a0) -; ILP32D-NEXT: flw fa2, %lo(var+12)(a0) ; ILP32D-NEXT: addi a1, a0, %lo(var) -; ILP32D-NEXT: flw fa1, 16(a1) -; ILP32D-NEXT: flw fa0, 20(a1) -; ILP32D-NEXT: flw ft0, 24(a1) -; ILP32D-NEXT: flw ft1, 28(a1) -; ILP32D-NEXT: flw ft2, 32(a1) -; ILP32D-NEXT: flw ft3, 36(a1) -; ILP32D-NEXT: flw ft4, 40(a1) -; ILP32D-NEXT: flw ft5, 44(a1) -; ILP32D-NEXT: flw ft6, 48(a1) -; ILP32D-NEXT: flw ft7, 52(a1) -; ILP32D-NEXT: flw fa6, 56(a1) -; ILP32D-NEXT: flw fa7, 60(a1) -; ILP32D-NEXT: flw ft8, 64(a1) -; ILP32D-NEXT: flw ft9, 68(a1) -; ILP32D-NEXT: flw ft10, 72(a1) -; ILP32D-NEXT: flw ft11, 76(a1) -; ILP32D-NEXT: flw fs0, 80(a1) -; ILP32D-NEXT: flw fs1, 84(a1) -; ILP32D-NEXT: flw fs2, 88(a1) -; ILP32D-NEXT: flw fs3, 92(a1) -; ILP32D-NEXT: flw fs4, 96(a1) -; ILP32D-NEXT: flw fs5, 100(a1) -; ILP32D-NEXT: flw fs6, 104(a1) -; ILP32D-NEXT: flw fs7, 108(a1) +; ILP32D-NEXT: flw fa4, 16(a1) +; ILP32D-NEXT: flw fa3, 20(a1) +; ILP32D-NEXT: flw fa2, 24(a1) +; ILP32D-NEXT: flw fa1, 28(a1) +; ILP32D-NEXT: flw fa0, 32(a1) +; ILP32D-NEXT: flw ft0, 36(a1) +; ILP32D-NEXT: flw ft1, 40(a1) +; ILP32D-NEXT: flw ft2, 44(a1) +; ILP32D-NEXT: flw ft3, 48(a1) +; ILP32D-NEXT: flw ft4, 52(a1) +; ILP32D-NEXT: flw ft5, 56(a1) +; ILP32D-NEXT: flw ft6, 60(a1) +; ILP32D-NEXT: flw ft7, 64(a1) +; ILP32D-NEXT: flw fa6, 68(a1) +; ILP32D-NEXT: flw fa7, 72(a1) +; ILP32D-NEXT: flw ft8, 76(a1) +; ILP32D-NEXT: flw ft9, 80(a1) +; ILP32D-NEXT: flw ft10, 84(a1) +; ILP32D-NEXT: flw ft11, 88(a1) +; ILP32D-NEXT: flw fs0, 92(a1) +; ILP32D-NEXT: flw fs1, 96(a1) +; ILP32D-NEXT: flw fs2, 100(a1) +; ILP32D-NEXT: flw fs3, 104(a1) +; ILP32D-NEXT: flw fs4, 108(a1) +; ILP32D-NEXT: flw fs5, 112(a1) +; ILP32D-NEXT: flw fs6, 116(a1) +; ILP32D-NEXT: flw fs7, 120(a1) ; ILP32D-NEXT: flw fs8, 124(a1) -; ILP32D-NEXT: flw fs9, 120(a1) -; ILP32D-NEXT: flw fs10, 116(a1) -; ILP32D-NEXT: flw fs11, 112(a1) +; ILP32D-NEXT: flw fs9, %lo(var+4)(a0) +; ILP32D-NEXT: flw fs10, %lo(var+8)(a0) +; ILP32D-NEXT: flw fs11, %lo(var+12)(a0) ; ILP32D-NEXT: fsw fs8, 124(a1) -; ILP32D-NEXT: fsw fs9, 120(a1) -; 
ILP32D-NEXT: fsw fs10, 116(a1) -; ILP32D-NEXT: fsw fs11, 112(a1) -; ILP32D-NEXT: fsw fs7, 108(a1) -; ILP32D-NEXT: fsw fs6, 104(a1) -; ILP32D-NEXT: fsw fs5, 100(a1) -; ILP32D-NEXT: fsw fs4, 96(a1) -; ILP32D-NEXT: fsw fs3, 92(a1) -; ILP32D-NEXT: fsw fs2, 88(a1) -; ILP32D-NEXT: fsw fs1, 84(a1) -; ILP32D-NEXT: fsw fs0, 80(a1) -; ILP32D-NEXT: fsw ft11, 76(a1) -; ILP32D-NEXT: fsw ft10, 72(a1) -; ILP32D-NEXT: fsw ft9, 68(a1) -; ILP32D-NEXT: fsw ft8, 64(a1) -; ILP32D-NEXT: fsw fa7, 60(a1) -; ILP32D-NEXT: fsw fa6, 56(a1) -; ILP32D-NEXT: fsw ft7, 52(a1) -; ILP32D-NEXT: fsw ft6, 48(a1) -; ILP32D-NEXT: fsw ft5, 44(a1) -; ILP32D-NEXT: fsw ft4, 40(a1) -; ILP32D-NEXT: fsw ft3, 36(a1) -; ILP32D-NEXT: fsw ft2, 32(a1) -; ILP32D-NEXT: fsw ft1, 28(a1) -; ILP32D-NEXT: fsw ft0, 24(a1) -; ILP32D-NEXT: fsw fa0, 20(a1) -; ILP32D-NEXT: fsw fa1, 16(a1) -; ILP32D-NEXT: fsw fa2, %lo(var+12)(a0) -; ILP32D-NEXT: fsw fa3, %lo(var+8)(a0) -; ILP32D-NEXT: fsw fa4, %lo(var+4)(a0) +; ILP32D-NEXT: fsw fs7, 120(a1) +; ILP32D-NEXT: fsw fs6, 116(a1) +; ILP32D-NEXT: fsw fs5, 112(a1) +; ILP32D-NEXT: fsw fs4, 108(a1) +; ILP32D-NEXT: fsw fs3, 104(a1) +; ILP32D-NEXT: fsw fs2, 100(a1) +; ILP32D-NEXT: fsw fs1, 96(a1) +; ILP32D-NEXT: fsw fs0, 92(a1) +; ILP32D-NEXT: fsw ft11, 88(a1) +; ILP32D-NEXT: fsw ft10, 84(a1) +; ILP32D-NEXT: fsw ft9, 80(a1) +; ILP32D-NEXT: fsw ft8, 76(a1) +; ILP32D-NEXT: fsw fa7, 72(a1) +; ILP32D-NEXT: fsw fa6, 68(a1) +; ILP32D-NEXT: fsw ft7, 64(a1) +; ILP32D-NEXT: fsw ft6, 60(a1) +; ILP32D-NEXT: fsw ft5, 56(a1) +; ILP32D-NEXT: fsw ft4, 52(a1) +; ILP32D-NEXT: fsw ft3, 48(a1) +; ILP32D-NEXT: fsw ft2, 44(a1) +; ILP32D-NEXT: fsw ft1, 40(a1) +; ILP32D-NEXT: fsw ft0, 36(a1) +; ILP32D-NEXT: fsw fa0, 32(a1) +; ILP32D-NEXT: fsw fa1, 28(a1) +; ILP32D-NEXT: fsw fa2, 24(a1) +; ILP32D-NEXT: fsw fa3, 20(a1) +; ILP32D-NEXT: fsw fa4, 16(a1) +; ILP32D-NEXT: fsw fs11, %lo(var+12)(a0) +; ILP32D-NEXT: fsw fs10, %lo(var+8)(a0) +; ILP32D-NEXT: fsw fs9, %lo(var+4)(a0) ; ILP32D-NEXT: fsw fa5, %lo(var)(a0) ; ILP32D-NEXT: fld fs0, 88(sp) # 8-byte Folded Reload ; ILP32D-NEXT: fld fs1, 80(sp) # 8-byte Folded Reload @@ -466,69 +466,69 @@ define void @callee() nounwind { ; LP64D-NEXT: fsd fs11, 0(sp) # 8-byte Folded Spill ; LP64D-NEXT: lui a0, %hi(var) ; LP64D-NEXT: flw fa5, %lo(var)(a0) -; LP64D-NEXT: flw fa4, %lo(var+4)(a0) -; LP64D-NEXT: flw fa3, %lo(var+8)(a0) -; LP64D-NEXT: flw fa2, %lo(var+12)(a0) ; LP64D-NEXT: addi a1, a0, %lo(var) -; LP64D-NEXT: flw fa1, 16(a1) -; LP64D-NEXT: flw fa0, 20(a1) -; LP64D-NEXT: flw ft0, 24(a1) -; LP64D-NEXT: flw ft1, 28(a1) -; LP64D-NEXT: flw ft2, 32(a1) -; LP64D-NEXT: flw ft3, 36(a1) -; LP64D-NEXT: flw ft4, 40(a1) -; LP64D-NEXT: flw ft5, 44(a1) -; LP64D-NEXT: flw ft6, 48(a1) -; LP64D-NEXT: flw ft7, 52(a1) -; LP64D-NEXT: flw fa6, 56(a1) -; LP64D-NEXT: flw fa7, 60(a1) -; LP64D-NEXT: flw ft8, 64(a1) -; LP64D-NEXT: flw ft9, 68(a1) -; LP64D-NEXT: flw ft10, 72(a1) -; LP64D-NEXT: flw ft11, 76(a1) -; LP64D-NEXT: flw fs0, 80(a1) -; LP64D-NEXT: flw fs1, 84(a1) -; LP64D-NEXT: flw fs2, 88(a1) -; LP64D-NEXT: flw fs3, 92(a1) -; LP64D-NEXT: flw fs4, 96(a1) -; LP64D-NEXT: flw fs5, 100(a1) -; LP64D-NEXT: flw fs6, 104(a1) -; LP64D-NEXT: flw fs7, 108(a1) +; LP64D-NEXT: flw fa4, 16(a1) +; LP64D-NEXT: flw fa3, 20(a1) +; LP64D-NEXT: flw fa2, 24(a1) +; LP64D-NEXT: flw fa1, 28(a1) +; LP64D-NEXT: flw fa0, 32(a1) +; LP64D-NEXT: flw ft0, 36(a1) +; LP64D-NEXT: flw ft1, 40(a1) +; LP64D-NEXT: flw ft2, 44(a1) +; LP64D-NEXT: flw ft3, 48(a1) +; LP64D-NEXT: flw ft4, 52(a1) +; LP64D-NEXT: flw ft5, 56(a1) +; LP64D-NEXT: flw ft6, 60(a1) +; 
LP64D-NEXT: flw ft7, 64(a1) +; LP64D-NEXT: flw fa6, 68(a1) +; LP64D-NEXT: flw fa7, 72(a1) +; LP64D-NEXT: flw ft8, 76(a1) +; LP64D-NEXT: flw ft9, 80(a1) +; LP64D-NEXT: flw ft10, 84(a1) +; LP64D-NEXT: flw ft11, 88(a1) +; LP64D-NEXT: flw fs0, 92(a1) +; LP64D-NEXT: flw fs1, 96(a1) +; LP64D-NEXT: flw fs2, 100(a1) +; LP64D-NEXT: flw fs3, 104(a1) +; LP64D-NEXT: flw fs4, 108(a1) +; LP64D-NEXT: flw fs5, 112(a1) +; LP64D-NEXT: flw fs6, 116(a1) +; LP64D-NEXT: flw fs7, 120(a1) ; LP64D-NEXT: flw fs8, 124(a1) -; LP64D-NEXT: flw fs9, 120(a1) -; LP64D-NEXT: flw fs10, 116(a1) -; LP64D-NEXT: flw fs11, 112(a1) +; LP64D-NEXT: flw fs9, %lo(var+4)(a0) +; LP64D-NEXT: flw fs10, %lo(var+8)(a0) +; LP64D-NEXT: flw fs11, %lo(var+12)(a0) ; LP64D-NEXT: fsw fs8, 124(a1) -; LP64D-NEXT: fsw fs9, 120(a1) -; LP64D-NEXT: fsw fs10, 116(a1) -; LP64D-NEXT: fsw fs11, 112(a1) -; LP64D-NEXT: fsw fs7, 108(a1) -; LP64D-NEXT: fsw fs6, 104(a1) -; LP64D-NEXT: fsw fs5, 100(a1) -; LP64D-NEXT: fsw fs4, 96(a1) -; LP64D-NEXT: fsw fs3, 92(a1) -; LP64D-NEXT: fsw fs2, 88(a1) -; LP64D-NEXT: fsw fs1, 84(a1) -; LP64D-NEXT: fsw fs0, 80(a1) -; LP64D-NEXT: fsw ft11, 76(a1) -; LP64D-NEXT: fsw ft10, 72(a1) -; LP64D-NEXT: fsw ft9, 68(a1) -; LP64D-NEXT: fsw ft8, 64(a1) -; LP64D-NEXT: fsw fa7, 60(a1) -; LP64D-NEXT: fsw fa6, 56(a1) -; LP64D-NEXT: fsw ft7, 52(a1) -; LP64D-NEXT: fsw ft6, 48(a1) -; LP64D-NEXT: fsw ft5, 44(a1) -; LP64D-NEXT: fsw ft4, 40(a1) -; LP64D-NEXT: fsw ft3, 36(a1) -; LP64D-NEXT: fsw ft2, 32(a1) -; LP64D-NEXT: fsw ft1, 28(a1) -; LP64D-NEXT: fsw ft0, 24(a1) -; LP64D-NEXT: fsw fa0, 20(a1) -; LP64D-NEXT: fsw fa1, 16(a1) -; LP64D-NEXT: fsw fa2, %lo(var+12)(a0) -; LP64D-NEXT: fsw fa3, %lo(var+8)(a0) -; LP64D-NEXT: fsw fa4, %lo(var+4)(a0) +; LP64D-NEXT: fsw fs7, 120(a1) +; LP64D-NEXT: fsw fs6, 116(a1) +; LP64D-NEXT: fsw fs5, 112(a1) +; LP64D-NEXT: fsw fs4, 108(a1) +; LP64D-NEXT: fsw fs3, 104(a1) +; LP64D-NEXT: fsw fs2, 100(a1) +; LP64D-NEXT: fsw fs1, 96(a1) +; LP64D-NEXT: fsw fs0, 92(a1) +; LP64D-NEXT: fsw ft11, 88(a1) +; LP64D-NEXT: fsw ft10, 84(a1) +; LP64D-NEXT: fsw ft9, 80(a1) +; LP64D-NEXT: fsw ft8, 76(a1) +; LP64D-NEXT: fsw fa7, 72(a1) +; LP64D-NEXT: fsw fa6, 68(a1) +; LP64D-NEXT: fsw ft7, 64(a1) +; LP64D-NEXT: fsw ft6, 60(a1) +; LP64D-NEXT: fsw ft5, 56(a1) +; LP64D-NEXT: fsw ft4, 52(a1) +; LP64D-NEXT: fsw ft3, 48(a1) +; LP64D-NEXT: fsw ft2, 44(a1) +; LP64D-NEXT: fsw ft1, 40(a1) +; LP64D-NEXT: fsw ft0, 36(a1) +; LP64D-NEXT: fsw fa0, 32(a1) +; LP64D-NEXT: fsw fa1, 28(a1) +; LP64D-NEXT: fsw fa2, 24(a1) +; LP64D-NEXT: fsw fa3, 20(a1) +; LP64D-NEXT: fsw fa4, 16(a1) +; LP64D-NEXT: fsw fs11, %lo(var+12)(a0) +; LP64D-NEXT: fsw fs10, %lo(var+8)(a0) +; LP64D-NEXT: fsw fs9, %lo(var+4)(a0) ; LP64D-NEXT: fsw fa5, %lo(var)(a0) ; LP64D-NEXT: fld fs0, 88(sp) # 8-byte Folded Reload ; LP64D-NEXT: fld fs1, 80(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll index 40076316bca89..a7f582f5f0699 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll @@ -20,141 +20,141 @@ define void @callee() nounwind { ; ILP32-LABEL: callee: ; ILP32: # %bb.0: ; ILP32-NEXT: lui a0, %hi(var) -; ILP32-NEXT: fld fa5, %lo(var)(a0) -; ILP32-NEXT: fld fa4, %lo(var+8)(a0) ; ILP32-NEXT: addi a1, a0, %lo(var) -; ILP32-NEXT: fld fa3, 16(a1) -; ILP32-NEXT: fld fa2, 24(a1) -; ILP32-NEXT: fld fa1, 32(a1) -; ILP32-NEXT: fld fa0, 40(a1) -; ILP32-NEXT: fld ft0, 48(a1) -; ILP32-NEXT: fld ft1, 56(a1) -; ILP32-NEXT: fld ft2, 64(a1) -; ILP32-NEXT: fld 
ft3, 72(a1) -; ILP32-NEXT: fld ft4, 80(a1) -; ILP32-NEXT: fld ft5, 88(a1) -; ILP32-NEXT: fld ft6, 96(a1) -; ILP32-NEXT: fld ft7, 104(a1) -; ILP32-NEXT: fld fa6, 112(a1) -; ILP32-NEXT: fld fa7, 120(a1) -; ILP32-NEXT: fld ft8, 128(a1) -; ILP32-NEXT: fld ft9, 136(a1) -; ILP32-NEXT: fld ft10, 144(a1) -; ILP32-NEXT: fld ft11, 152(a1) -; ILP32-NEXT: fld fs0, 160(a1) -; ILP32-NEXT: fld fs1, 168(a1) -; ILP32-NEXT: fld fs2, 176(a1) -; ILP32-NEXT: fld fs3, 184(a1) -; ILP32-NEXT: fld fs4, 192(a1) -; ILP32-NEXT: fld fs5, 200(a1) -; ILP32-NEXT: fld fs6, 208(a1) -; ILP32-NEXT: fld fs7, 216(a1) -; ILP32-NEXT: fld fs8, 248(a1) +; ILP32-NEXT: fld fa5, 248(a1) +; ILP32-NEXT: fld fa4, 16(a1) +; ILP32-NEXT: fld fa3, 24(a1) +; ILP32-NEXT: fld fa2, 32(a1) +; ILP32-NEXT: fld fa1, 40(a1) +; ILP32-NEXT: fld fa0, 48(a1) +; ILP32-NEXT: fld ft0, 56(a1) +; ILP32-NEXT: fld ft1, 64(a1) +; ILP32-NEXT: fld ft2, 72(a1) +; ILP32-NEXT: fld ft3, 80(a1) +; ILP32-NEXT: fld ft4, 88(a1) +; ILP32-NEXT: fld ft5, 96(a1) +; ILP32-NEXT: fld ft6, 104(a1) +; ILP32-NEXT: fld ft7, 112(a1) +; ILP32-NEXT: fld fa6, 120(a1) +; ILP32-NEXT: fld fa7, 128(a1) +; ILP32-NEXT: fld ft8, 136(a1) +; ILP32-NEXT: fld ft9, 144(a1) +; ILP32-NEXT: fld ft10, 152(a1) +; ILP32-NEXT: fld ft11, 160(a1) +; ILP32-NEXT: fld fs0, 168(a1) +; ILP32-NEXT: fld fs1, 176(a1) +; ILP32-NEXT: fld fs2, 184(a1) +; ILP32-NEXT: fld fs3, 192(a1) +; ILP32-NEXT: fld fs4, 200(a1) +; ILP32-NEXT: fld fs5, 208(a1) +; ILP32-NEXT: fld fs6, 216(a1) +; ILP32-NEXT: fld fs7, 224(a1) +; ILP32-NEXT: fld fs8, 232(a1) ; ILP32-NEXT: fld fs9, 240(a1) -; ILP32-NEXT: fld fs10, 232(a1) -; ILP32-NEXT: fld fs11, 224(a1) -; ILP32-NEXT: fsd fs8, 248(a1) +; ILP32-NEXT: fld fs10, %lo(var)(a0) +; ILP32-NEXT: fld fs11, %lo(var+8)(a0) +; ILP32-NEXT: fsd fa5, 248(a1) ; ILP32-NEXT: fsd fs9, 240(a1) -; ILP32-NEXT: fsd fs10, 232(a1) -; ILP32-NEXT: fsd fs11, 224(a1) -; ILP32-NEXT: fsd fs7, 216(a1) -; ILP32-NEXT: fsd fs6, 208(a1) -; ILP32-NEXT: fsd fs5, 200(a1) -; ILP32-NEXT: fsd fs4, 192(a1) -; ILP32-NEXT: fsd fs3, 184(a1) -; ILP32-NEXT: fsd fs2, 176(a1) -; ILP32-NEXT: fsd fs1, 168(a1) -; ILP32-NEXT: fsd fs0, 160(a1) -; ILP32-NEXT: fsd ft11, 152(a1) -; ILP32-NEXT: fsd ft10, 144(a1) -; ILP32-NEXT: fsd ft9, 136(a1) -; ILP32-NEXT: fsd ft8, 128(a1) -; ILP32-NEXT: fsd fa7, 120(a1) -; ILP32-NEXT: fsd fa6, 112(a1) -; ILP32-NEXT: fsd ft7, 104(a1) -; ILP32-NEXT: fsd ft6, 96(a1) -; ILP32-NEXT: fsd ft5, 88(a1) -; ILP32-NEXT: fsd ft4, 80(a1) -; ILP32-NEXT: fsd ft3, 72(a1) -; ILP32-NEXT: fsd ft2, 64(a1) -; ILP32-NEXT: fsd ft1, 56(a1) -; ILP32-NEXT: fsd ft0, 48(a1) -; ILP32-NEXT: fsd fa0, 40(a1) -; ILP32-NEXT: fsd fa1, 32(a1) -; ILP32-NEXT: fsd fa2, 24(a1) -; ILP32-NEXT: fsd fa3, 16(a1) -; ILP32-NEXT: fsd fa4, %lo(var+8)(a0) -; ILP32-NEXT: fsd fa5, %lo(var)(a0) +; ILP32-NEXT: fsd fs8, 232(a1) +; ILP32-NEXT: fsd fs7, 224(a1) +; ILP32-NEXT: fsd fs6, 216(a1) +; ILP32-NEXT: fsd fs5, 208(a1) +; ILP32-NEXT: fsd fs4, 200(a1) +; ILP32-NEXT: fsd fs3, 192(a1) +; ILP32-NEXT: fsd fs2, 184(a1) +; ILP32-NEXT: fsd fs1, 176(a1) +; ILP32-NEXT: fsd fs0, 168(a1) +; ILP32-NEXT: fsd ft11, 160(a1) +; ILP32-NEXT: fsd ft10, 152(a1) +; ILP32-NEXT: fsd ft9, 144(a1) +; ILP32-NEXT: fsd ft8, 136(a1) +; ILP32-NEXT: fsd fa7, 128(a1) +; ILP32-NEXT: fsd fa6, 120(a1) +; ILP32-NEXT: fsd ft7, 112(a1) +; ILP32-NEXT: fsd ft6, 104(a1) +; ILP32-NEXT: fsd ft5, 96(a1) +; ILP32-NEXT: fsd ft4, 88(a1) +; ILP32-NEXT: fsd ft3, 80(a1) +; ILP32-NEXT: fsd ft2, 72(a1) +; ILP32-NEXT: fsd ft1, 64(a1) +; ILP32-NEXT: fsd ft0, 56(a1) +; ILP32-NEXT: fsd fa0, 48(a1) +; ILP32-NEXT: fsd 
fa1, 40(a1) +; ILP32-NEXT: fsd fa2, 32(a1) +; ILP32-NEXT: fsd fa3, 24(a1) +; ILP32-NEXT: fsd fa4, 16(a1) +; ILP32-NEXT: fsd fs11, %lo(var+8)(a0) +; ILP32-NEXT: fsd fs10, %lo(var)(a0) ; ILP32-NEXT: ret ; ; LP64-LABEL: callee: ; LP64: # %bb.0: ; LP64-NEXT: lui a0, %hi(var) -; LP64-NEXT: fld fa5, %lo(var)(a0) -; LP64-NEXT: fld fa4, %lo(var+8)(a0) ; LP64-NEXT: addi a1, a0, %lo(var) -; LP64-NEXT: fld fa3, 16(a1) -; LP64-NEXT: fld fa2, 24(a1) -; LP64-NEXT: fld fa1, 32(a1) -; LP64-NEXT: fld fa0, 40(a1) -; LP64-NEXT: fld ft0, 48(a1) -; LP64-NEXT: fld ft1, 56(a1) -; LP64-NEXT: fld ft2, 64(a1) -; LP64-NEXT: fld ft3, 72(a1) -; LP64-NEXT: fld ft4, 80(a1) -; LP64-NEXT: fld ft5, 88(a1) -; LP64-NEXT: fld ft6, 96(a1) -; LP64-NEXT: fld ft7, 104(a1) -; LP64-NEXT: fld fa6, 112(a1) -; LP64-NEXT: fld fa7, 120(a1) -; LP64-NEXT: fld ft8, 128(a1) -; LP64-NEXT: fld ft9, 136(a1) -; LP64-NEXT: fld ft10, 144(a1) -; LP64-NEXT: fld ft11, 152(a1) -; LP64-NEXT: fld fs0, 160(a1) -; LP64-NEXT: fld fs1, 168(a1) -; LP64-NEXT: fld fs2, 176(a1) -; LP64-NEXT: fld fs3, 184(a1) -; LP64-NEXT: fld fs4, 192(a1) -; LP64-NEXT: fld fs5, 200(a1) -; LP64-NEXT: fld fs6, 208(a1) -; LP64-NEXT: fld fs7, 216(a1) -; LP64-NEXT: fld fs8, 248(a1) +; LP64-NEXT: fld fa5, 248(a1) +; LP64-NEXT: fld fa4, 16(a1) +; LP64-NEXT: fld fa3, 24(a1) +; LP64-NEXT: fld fa2, 32(a1) +; LP64-NEXT: fld fa1, 40(a1) +; LP64-NEXT: fld fa0, 48(a1) +; LP64-NEXT: fld ft0, 56(a1) +; LP64-NEXT: fld ft1, 64(a1) +; LP64-NEXT: fld ft2, 72(a1) +; LP64-NEXT: fld ft3, 80(a1) +; LP64-NEXT: fld ft4, 88(a1) +; LP64-NEXT: fld ft5, 96(a1) +; LP64-NEXT: fld ft6, 104(a1) +; LP64-NEXT: fld ft7, 112(a1) +; LP64-NEXT: fld fa6, 120(a1) +; LP64-NEXT: fld fa7, 128(a1) +; LP64-NEXT: fld ft8, 136(a1) +; LP64-NEXT: fld ft9, 144(a1) +; LP64-NEXT: fld ft10, 152(a1) +; LP64-NEXT: fld ft11, 160(a1) +; LP64-NEXT: fld fs0, 168(a1) +; LP64-NEXT: fld fs1, 176(a1) +; LP64-NEXT: fld fs2, 184(a1) +; LP64-NEXT: fld fs3, 192(a1) +; LP64-NEXT: fld fs4, 200(a1) +; LP64-NEXT: fld fs5, 208(a1) +; LP64-NEXT: fld fs6, 216(a1) +; LP64-NEXT: fld fs7, 224(a1) +; LP64-NEXT: fld fs8, 232(a1) ; LP64-NEXT: fld fs9, 240(a1) -; LP64-NEXT: fld fs10, 232(a1) -; LP64-NEXT: fld fs11, 224(a1) -; LP64-NEXT: fsd fs8, 248(a1) +; LP64-NEXT: fld fs10, %lo(var)(a0) +; LP64-NEXT: fld fs11, %lo(var+8)(a0) +; LP64-NEXT: fsd fa5, 248(a1) ; LP64-NEXT: fsd fs9, 240(a1) -; LP64-NEXT: fsd fs10, 232(a1) -; LP64-NEXT: fsd fs11, 224(a1) -; LP64-NEXT: fsd fs7, 216(a1) -; LP64-NEXT: fsd fs6, 208(a1) -; LP64-NEXT: fsd fs5, 200(a1) -; LP64-NEXT: fsd fs4, 192(a1) -; LP64-NEXT: fsd fs3, 184(a1) -; LP64-NEXT: fsd fs2, 176(a1) -; LP64-NEXT: fsd fs1, 168(a1) -; LP64-NEXT: fsd fs0, 160(a1) -; LP64-NEXT: fsd ft11, 152(a1) -; LP64-NEXT: fsd ft10, 144(a1) -; LP64-NEXT: fsd ft9, 136(a1) -; LP64-NEXT: fsd ft8, 128(a1) -; LP64-NEXT: fsd fa7, 120(a1) -; LP64-NEXT: fsd fa6, 112(a1) -; LP64-NEXT: fsd ft7, 104(a1) -; LP64-NEXT: fsd ft6, 96(a1) -; LP64-NEXT: fsd ft5, 88(a1) -; LP64-NEXT: fsd ft4, 80(a1) -; LP64-NEXT: fsd ft3, 72(a1) -; LP64-NEXT: fsd ft2, 64(a1) -; LP64-NEXT: fsd ft1, 56(a1) -; LP64-NEXT: fsd ft0, 48(a1) -; LP64-NEXT: fsd fa0, 40(a1) -; LP64-NEXT: fsd fa1, 32(a1) -; LP64-NEXT: fsd fa2, 24(a1) -; LP64-NEXT: fsd fa3, 16(a1) -; LP64-NEXT: fsd fa4, %lo(var+8)(a0) -; LP64-NEXT: fsd fa5, %lo(var)(a0) +; LP64-NEXT: fsd fs8, 232(a1) +; LP64-NEXT: fsd fs7, 224(a1) +; LP64-NEXT: fsd fs6, 216(a1) +; LP64-NEXT: fsd fs5, 208(a1) +; LP64-NEXT: fsd fs4, 200(a1) +; LP64-NEXT: fsd fs3, 192(a1) +; LP64-NEXT: fsd fs2, 184(a1) +; LP64-NEXT: fsd fs1, 176(a1) +; 
LP64-NEXT: fsd fs0, 168(a1) +; LP64-NEXT: fsd ft11, 160(a1) +; LP64-NEXT: fsd ft10, 152(a1) +; LP64-NEXT: fsd ft9, 144(a1) +; LP64-NEXT: fsd ft8, 136(a1) +; LP64-NEXT: fsd fa7, 128(a1) +; LP64-NEXT: fsd fa6, 120(a1) +; LP64-NEXT: fsd ft7, 112(a1) +; LP64-NEXT: fsd ft6, 104(a1) +; LP64-NEXT: fsd ft5, 96(a1) +; LP64-NEXT: fsd ft4, 88(a1) +; LP64-NEXT: fsd ft3, 80(a1) +; LP64-NEXT: fsd ft2, 72(a1) +; LP64-NEXT: fsd ft1, 64(a1) +; LP64-NEXT: fsd ft0, 56(a1) +; LP64-NEXT: fsd fa0, 48(a1) +; LP64-NEXT: fsd fa1, 40(a1) +; LP64-NEXT: fsd fa2, 32(a1) +; LP64-NEXT: fsd fa3, 24(a1) +; LP64-NEXT: fsd fa4, 16(a1) +; LP64-NEXT: fsd fs11, %lo(var+8)(a0) +; LP64-NEXT: fsd fs10, %lo(var)(a0) ; LP64-NEXT: ret ; ; ILP32D-LABEL: callee: @@ -173,71 +173,71 @@ define void @callee() nounwind { ; ILP32D-NEXT: fsd fs10, 8(sp) # 8-byte Folded Spill ; ILP32D-NEXT: fsd fs11, 0(sp) # 8-byte Folded Spill ; ILP32D-NEXT: lui a0, %hi(var) -; ILP32D-NEXT: fld fa5, %lo(var)(a0) -; ILP32D-NEXT: fld fa4, %lo(var+8)(a0) ; ILP32D-NEXT: addi a1, a0, %lo(var) -; ILP32D-NEXT: fld fa3, 16(a1) -; ILP32D-NEXT: fld fa2, 24(a1) -; ILP32D-NEXT: fld fa1, 32(a1) -; ILP32D-NEXT: fld fa0, 40(a1) -; ILP32D-NEXT: fld ft0, 48(a1) -; ILP32D-NEXT: fld ft1, 56(a1) -; ILP32D-NEXT: fld ft2, 64(a1) -; ILP32D-NEXT: fld ft3, 72(a1) -; ILP32D-NEXT: fld ft4, 80(a1) -; ILP32D-NEXT: fld ft5, 88(a1) -; ILP32D-NEXT: fld ft6, 96(a1) -; ILP32D-NEXT: fld ft7, 104(a1) -; ILP32D-NEXT: fld fa6, 112(a1) -; ILP32D-NEXT: fld fa7, 120(a1) -; ILP32D-NEXT: fld ft8, 128(a1) -; ILP32D-NEXT: fld ft9, 136(a1) -; ILP32D-NEXT: fld ft10, 144(a1) -; ILP32D-NEXT: fld ft11, 152(a1) -; ILP32D-NEXT: fld fs0, 160(a1) -; ILP32D-NEXT: fld fs1, 168(a1) -; ILP32D-NEXT: fld fs2, 176(a1) -; ILP32D-NEXT: fld fs3, 184(a1) -; ILP32D-NEXT: fld fs4, 192(a1) -; ILP32D-NEXT: fld fs5, 200(a1) -; ILP32D-NEXT: fld fs6, 208(a1) -; ILP32D-NEXT: fld fs7, 216(a1) -; ILP32D-NEXT: fld fs8, 248(a1) +; ILP32D-NEXT: fld fa5, 248(a1) +; ILP32D-NEXT: fld fa4, 16(a1) +; ILP32D-NEXT: fld fa3, 24(a1) +; ILP32D-NEXT: fld fa2, 32(a1) +; ILP32D-NEXT: fld fa1, 40(a1) +; ILP32D-NEXT: fld fa0, 48(a1) +; ILP32D-NEXT: fld ft0, 56(a1) +; ILP32D-NEXT: fld ft1, 64(a1) +; ILP32D-NEXT: fld ft2, 72(a1) +; ILP32D-NEXT: fld ft3, 80(a1) +; ILP32D-NEXT: fld ft4, 88(a1) +; ILP32D-NEXT: fld ft5, 96(a1) +; ILP32D-NEXT: fld ft6, 104(a1) +; ILP32D-NEXT: fld ft7, 112(a1) +; ILP32D-NEXT: fld fa6, 120(a1) +; ILP32D-NEXT: fld fa7, 128(a1) +; ILP32D-NEXT: fld ft8, 136(a1) +; ILP32D-NEXT: fld ft9, 144(a1) +; ILP32D-NEXT: fld ft10, 152(a1) +; ILP32D-NEXT: fld ft11, 160(a1) +; ILP32D-NEXT: fld fs0, 168(a1) +; ILP32D-NEXT: fld fs1, 176(a1) +; ILP32D-NEXT: fld fs2, 184(a1) +; ILP32D-NEXT: fld fs3, 192(a1) +; ILP32D-NEXT: fld fs4, 200(a1) +; ILP32D-NEXT: fld fs5, 208(a1) +; ILP32D-NEXT: fld fs6, 216(a1) +; ILP32D-NEXT: fld fs7, 224(a1) +; ILP32D-NEXT: fld fs8, 232(a1) ; ILP32D-NEXT: fld fs9, 240(a1) -; ILP32D-NEXT: fld fs10, 232(a1) -; ILP32D-NEXT: fld fs11, 224(a1) -; ILP32D-NEXT: fsd fs8, 248(a1) +; ILP32D-NEXT: fld fs10, %lo(var)(a0) +; ILP32D-NEXT: fld fs11, %lo(var+8)(a0) +; ILP32D-NEXT: fsd fa5, 248(a1) ; ILP32D-NEXT: fsd fs9, 240(a1) -; ILP32D-NEXT: fsd fs10, 232(a1) -; ILP32D-NEXT: fsd fs11, 224(a1) -; ILP32D-NEXT: fsd fs7, 216(a1) -; ILP32D-NEXT: fsd fs6, 208(a1) -; ILP32D-NEXT: fsd fs5, 200(a1) -; ILP32D-NEXT: fsd fs4, 192(a1) -; ILP32D-NEXT: fsd fs3, 184(a1) -; ILP32D-NEXT: fsd fs2, 176(a1) -; ILP32D-NEXT: fsd fs1, 168(a1) -; ILP32D-NEXT: fsd fs0, 160(a1) -; ILP32D-NEXT: fsd ft11, 152(a1) -; ILP32D-NEXT: fsd ft10, 144(a1) -; 
ILP32D-NEXT: fsd ft9, 136(a1) -; ILP32D-NEXT: fsd ft8, 128(a1) -; ILP32D-NEXT: fsd fa7, 120(a1) -; ILP32D-NEXT: fsd fa6, 112(a1) -; ILP32D-NEXT: fsd ft7, 104(a1) -; ILP32D-NEXT: fsd ft6, 96(a1) -; ILP32D-NEXT: fsd ft5, 88(a1) -; ILP32D-NEXT: fsd ft4, 80(a1) -; ILP32D-NEXT: fsd ft3, 72(a1) -; ILP32D-NEXT: fsd ft2, 64(a1) -; ILP32D-NEXT: fsd ft1, 56(a1) -; ILP32D-NEXT: fsd ft0, 48(a1) -; ILP32D-NEXT: fsd fa0, 40(a1) -; ILP32D-NEXT: fsd fa1, 32(a1) -; ILP32D-NEXT: fsd fa2, 24(a1) -; ILP32D-NEXT: fsd fa3, 16(a1) -; ILP32D-NEXT: fsd fa4, %lo(var+8)(a0) -; ILP32D-NEXT: fsd fa5, %lo(var)(a0) +; ILP32D-NEXT: fsd fs8, 232(a1) +; ILP32D-NEXT: fsd fs7, 224(a1) +; ILP32D-NEXT: fsd fs6, 216(a1) +; ILP32D-NEXT: fsd fs5, 208(a1) +; ILP32D-NEXT: fsd fs4, 200(a1) +; ILP32D-NEXT: fsd fs3, 192(a1) +; ILP32D-NEXT: fsd fs2, 184(a1) +; ILP32D-NEXT: fsd fs1, 176(a1) +; ILP32D-NEXT: fsd fs0, 168(a1) +; ILP32D-NEXT: fsd ft11, 160(a1) +; ILP32D-NEXT: fsd ft10, 152(a1) +; ILP32D-NEXT: fsd ft9, 144(a1) +; ILP32D-NEXT: fsd ft8, 136(a1) +; ILP32D-NEXT: fsd fa7, 128(a1) +; ILP32D-NEXT: fsd fa6, 120(a1) +; ILP32D-NEXT: fsd ft7, 112(a1) +; ILP32D-NEXT: fsd ft6, 104(a1) +; ILP32D-NEXT: fsd ft5, 96(a1) +; ILP32D-NEXT: fsd ft4, 88(a1) +; ILP32D-NEXT: fsd ft3, 80(a1) +; ILP32D-NEXT: fsd ft2, 72(a1) +; ILP32D-NEXT: fsd ft1, 64(a1) +; ILP32D-NEXT: fsd ft0, 56(a1) +; ILP32D-NEXT: fsd fa0, 48(a1) +; ILP32D-NEXT: fsd fa1, 40(a1) +; ILP32D-NEXT: fsd fa2, 32(a1) +; ILP32D-NEXT: fsd fa3, 24(a1) +; ILP32D-NEXT: fsd fa4, 16(a1) +; ILP32D-NEXT: fsd fs11, %lo(var+8)(a0) +; ILP32D-NEXT: fsd fs10, %lo(var)(a0) ; ILP32D-NEXT: fld fs0, 88(sp) # 8-byte Folded Reload ; ILP32D-NEXT: fld fs1, 80(sp) # 8-byte Folded Reload ; ILP32D-NEXT: fld fs2, 72(sp) # 8-byte Folded Reload @@ -269,71 +269,71 @@ define void @callee() nounwind { ; LP64D-NEXT: fsd fs10, 8(sp) # 8-byte Folded Spill ; LP64D-NEXT: fsd fs11, 0(sp) # 8-byte Folded Spill ; LP64D-NEXT: lui a0, %hi(var) -; LP64D-NEXT: fld fa5, %lo(var)(a0) -; LP64D-NEXT: fld fa4, %lo(var+8)(a0) ; LP64D-NEXT: addi a1, a0, %lo(var) -; LP64D-NEXT: fld fa3, 16(a1) -; LP64D-NEXT: fld fa2, 24(a1) -; LP64D-NEXT: fld fa1, 32(a1) -; LP64D-NEXT: fld fa0, 40(a1) -; LP64D-NEXT: fld ft0, 48(a1) -; LP64D-NEXT: fld ft1, 56(a1) -; LP64D-NEXT: fld ft2, 64(a1) -; LP64D-NEXT: fld ft3, 72(a1) -; LP64D-NEXT: fld ft4, 80(a1) -; LP64D-NEXT: fld ft5, 88(a1) -; LP64D-NEXT: fld ft6, 96(a1) -; LP64D-NEXT: fld ft7, 104(a1) -; LP64D-NEXT: fld fa6, 112(a1) -; LP64D-NEXT: fld fa7, 120(a1) -; LP64D-NEXT: fld ft8, 128(a1) -; LP64D-NEXT: fld ft9, 136(a1) -; LP64D-NEXT: fld ft10, 144(a1) -; LP64D-NEXT: fld ft11, 152(a1) -; LP64D-NEXT: fld fs0, 160(a1) -; LP64D-NEXT: fld fs1, 168(a1) -; LP64D-NEXT: fld fs2, 176(a1) -; LP64D-NEXT: fld fs3, 184(a1) -; LP64D-NEXT: fld fs4, 192(a1) -; LP64D-NEXT: fld fs5, 200(a1) -; LP64D-NEXT: fld fs6, 208(a1) -; LP64D-NEXT: fld fs7, 216(a1) -; LP64D-NEXT: fld fs8, 248(a1) +; LP64D-NEXT: fld fa5, 248(a1) +; LP64D-NEXT: fld fa4, 16(a1) +; LP64D-NEXT: fld fa3, 24(a1) +; LP64D-NEXT: fld fa2, 32(a1) +; LP64D-NEXT: fld fa1, 40(a1) +; LP64D-NEXT: fld fa0, 48(a1) +; LP64D-NEXT: fld ft0, 56(a1) +; LP64D-NEXT: fld ft1, 64(a1) +; LP64D-NEXT: fld ft2, 72(a1) +; LP64D-NEXT: fld ft3, 80(a1) +; LP64D-NEXT: fld ft4, 88(a1) +; LP64D-NEXT: fld ft5, 96(a1) +; LP64D-NEXT: fld ft6, 104(a1) +; LP64D-NEXT: fld ft7, 112(a1) +; LP64D-NEXT: fld fa6, 120(a1) +; LP64D-NEXT: fld fa7, 128(a1) +; LP64D-NEXT: fld ft8, 136(a1) +; LP64D-NEXT: fld ft9, 144(a1) +; LP64D-NEXT: fld ft10, 152(a1) +; LP64D-NEXT: fld ft11, 160(a1) +; LP64D-NEXT: 
fld fs0, 168(a1) +; LP64D-NEXT: fld fs1, 176(a1) +; LP64D-NEXT: fld fs2, 184(a1) +; LP64D-NEXT: fld fs3, 192(a1) +; LP64D-NEXT: fld fs4, 200(a1) +; LP64D-NEXT: fld fs5, 208(a1) +; LP64D-NEXT: fld fs6, 216(a1) +; LP64D-NEXT: fld fs7, 224(a1) +; LP64D-NEXT: fld fs8, 232(a1) ; LP64D-NEXT: fld fs9, 240(a1) -; LP64D-NEXT: fld fs10, 232(a1) -; LP64D-NEXT: fld fs11, 224(a1) -; LP64D-NEXT: fsd fs8, 248(a1) +; LP64D-NEXT: fld fs10, %lo(var)(a0) +; LP64D-NEXT: fld fs11, %lo(var+8)(a0) +; LP64D-NEXT: fsd fa5, 248(a1) ; LP64D-NEXT: fsd fs9, 240(a1) -; LP64D-NEXT: fsd fs10, 232(a1) -; LP64D-NEXT: fsd fs11, 224(a1) -; LP64D-NEXT: fsd fs7, 216(a1) -; LP64D-NEXT: fsd fs6, 208(a1) -; LP64D-NEXT: fsd fs5, 200(a1) -; LP64D-NEXT: fsd fs4, 192(a1) -; LP64D-NEXT: fsd fs3, 184(a1) -; LP64D-NEXT: fsd fs2, 176(a1) -; LP64D-NEXT: fsd fs1, 168(a1) -; LP64D-NEXT: fsd fs0, 160(a1) -; LP64D-NEXT: fsd ft11, 152(a1) -; LP64D-NEXT: fsd ft10, 144(a1) -; LP64D-NEXT: fsd ft9, 136(a1) -; LP64D-NEXT: fsd ft8, 128(a1) -; LP64D-NEXT: fsd fa7, 120(a1) -; LP64D-NEXT: fsd fa6, 112(a1) -; LP64D-NEXT: fsd ft7, 104(a1) -; LP64D-NEXT: fsd ft6, 96(a1) -; LP64D-NEXT: fsd ft5, 88(a1) -; LP64D-NEXT: fsd ft4, 80(a1) -; LP64D-NEXT: fsd ft3, 72(a1) -; LP64D-NEXT: fsd ft2, 64(a1) -; LP64D-NEXT: fsd ft1, 56(a1) -; LP64D-NEXT: fsd ft0, 48(a1) -; LP64D-NEXT: fsd fa0, 40(a1) -; LP64D-NEXT: fsd fa1, 32(a1) -; LP64D-NEXT: fsd fa2, 24(a1) -; LP64D-NEXT: fsd fa3, 16(a1) -; LP64D-NEXT: fsd fa4, %lo(var+8)(a0) -; LP64D-NEXT: fsd fa5, %lo(var)(a0) +; LP64D-NEXT: fsd fs8, 232(a1) +; LP64D-NEXT: fsd fs7, 224(a1) +; LP64D-NEXT: fsd fs6, 216(a1) +; LP64D-NEXT: fsd fs5, 208(a1) +; LP64D-NEXT: fsd fs4, 200(a1) +; LP64D-NEXT: fsd fs3, 192(a1) +; LP64D-NEXT: fsd fs2, 184(a1) +; LP64D-NEXT: fsd fs1, 176(a1) +; LP64D-NEXT: fsd fs0, 168(a1) +; LP64D-NEXT: fsd ft11, 160(a1) +; LP64D-NEXT: fsd ft10, 152(a1) +; LP64D-NEXT: fsd ft9, 144(a1) +; LP64D-NEXT: fsd ft8, 136(a1) +; LP64D-NEXT: fsd fa7, 128(a1) +; LP64D-NEXT: fsd fa6, 120(a1) +; LP64D-NEXT: fsd ft7, 112(a1) +; LP64D-NEXT: fsd ft6, 104(a1) +; LP64D-NEXT: fsd ft5, 96(a1) +; LP64D-NEXT: fsd ft4, 88(a1) +; LP64D-NEXT: fsd ft3, 80(a1) +; LP64D-NEXT: fsd ft2, 72(a1) +; LP64D-NEXT: fsd ft1, 64(a1) +; LP64D-NEXT: fsd ft0, 56(a1) +; LP64D-NEXT: fsd fa0, 48(a1) +; LP64D-NEXT: fsd fa1, 40(a1) +; LP64D-NEXT: fsd fa2, 32(a1) +; LP64D-NEXT: fsd fa3, 24(a1) +; LP64D-NEXT: fsd fa4, 16(a1) +; LP64D-NEXT: fsd fs11, %lo(var+8)(a0) +; LP64D-NEXT: fsd fs10, %lo(var)(a0) ; LP64D-NEXT: fld fs0, 88(sp) # 8-byte Folded Reload ; LP64D-NEXT: fld fs1, 80(sp) # 8-byte Folded Reload ; LP64D-NEXT: fld fs2, 72(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll index 09ecbbc7e8feb..a8ca5d02ff78d 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll @@ -50,84 +50,84 @@ define void @callee() nounwind { ; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a6, %hi(var) -; RV32I-NEXT: lw a0, %lo(var)(a6) +; RV32I-NEXT: lui a4, %hi(var) +; RV32I-NEXT: lw a0, %lo(var)(a4) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+4)(a6) +; RV32I-NEXT: addi a2, a4, %lo(var) +; RV32I-NEXT: lw a0, 16(a2) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+8)(a6) +; RV32I-NEXT: lw a0, 20(a2) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: 
lw a0, %lo(var+12)(a6) +; RV32I-NEXT: lw a0, 24(a2) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a6, %lo(var) -; RV32I-NEXT: lw a0, 16(a5) +; RV32I-NEXT: lw a0, 28(a2) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, 20(a5) +; RV32I-NEXT: lw a0, 32(a2) ; RV32I-NEXT: sw a0, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw t0, 24(a5) -; RV32I-NEXT: lw t1, 28(a5) -; RV32I-NEXT: lw t2, 32(a5) -; RV32I-NEXT: lw t3, 36(a5) -; RV32I-NEXT: lw t4, 40(a5) -; RV32I-NEXT: lw t5, 44(a5) -; RV32I-NEXT: lw t6, 48(a5) -; RV32I-NEXT: lw s0, 52(a5) -; RV32I-NEXT: lw s1, 56(a5) -; RV32I-NEXT: lw s2, 60(a5) -; RV32I-NEXT: lw s3, 64(a5) -; RV32I-NEXT: lw s4, 68(a5) -; RV32I-NEXT: lw s5, 72(a5) -; RV32I-NEXT: lw s6, 76(a5) -; RV32I-NEXT: lw s7, 80(a5) -; RV32I-NEXT: lw s8, 84(a5) -; RV32I-NEXT: lw s9, 88(a5) -; RV32I-NEXT: lw s10, 92(a5) -; RV32I-NEXT: lw s11, 96(a5) -; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a7, 104(a5) -; RV32I-NEXT: lw a4, 108(a5) -; RV32I-NEXT: lw a0, 124(a5) -; RV32I-NEXT: lw a1, 120(a5) -; RV32I-NEXT: lw a2, 116(a5) -; RV32I-NEXT: lw a3, 112(a5) -; RV32I-NEXT: sw a0, 124(a5) -; RV32I-NEXT: sw a1, 120(a5) -; RV32I-NEXT: sw a2, 116(a5) -; RV32I-NEXT: sw a3, 112(a5) -; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a7, 104(a5) -; RV32I-NEXT: sw ra, 100(a5) -; RV32I-NEXT: sw s11, 96(a5) -; RV32I-NEXT: sw s10, 92(a5) -; RV32I-NEXT: sw s9, 88(a5) -; RV32I-NEXT: sw s8, 84(a5) -; RV32I-NEXT: sw s7, 80(a5) -; RV32I-NEXT: sw s6, 76(a5) -; RV32I-NEXT: sw s5, 72(a5) -; RV32I-NEXT: sw s4, 68(a5) -; RV32I-NEXT: sw s3, 64(a5) -; RV32I-NEXT: sw s2, 60(a5) -; RV32I-NEXT: sw s1, 56(a5) -; RV32I-NEXT: sw s0, 52(a5) -; RV32I-NEXT: sw t6, 48(a5) -; RV32I-NEXT: sw t5, 44(a5) -; RV32I-NEXT: sw t4, 40(a5) -; RV32I-NEXT: sw t3, 36(a5) -; RV32I-NEXT: sw t2, 32(a5) -; RV32I-NEXT: sw t1, 28(a5) -; RV32I-NEXT: sw t0, 24(a5) +; RV32I-NEXT: lw t0, 36(a2) +; RV32I-NEXT: lw t1, 40(a2) +; RV32I-NEXT: lw t2, 44(a2) +; RV32I-NEXT: lw t3, 48(a2) +; RV32I-NEXT: lw t4, 52(a2) +; RV32I-NEXT: lw t5, 56(a2) +; RV32I-NEXT: lw t6, 60(a2) +; RV32I-NEXT: lw s0, 64(a2) +; RV32I-NEXT: lw s1, 68(a2) +; RV32I-NEXT: lw s2, 72(a2) +; RV32I-NEXT: lw s3, 76(a2) +; RV32I-NEXT: lw s4, 80(a2) +; RV32I-NEXT: lw s5, 84(a2) +; RV32I-NEXT: lw s6, 88(a2) +; RV32I-NEXT: lw s7, 92(a2) +; RV32I-NEXT: lw s8, 96(a2) +; RV32I-NEXT: lw s9, 100(a2) +; RV32I-NEXT: lw s10, 104(a2) +; RV32I-NEXT: lw s11, 108(a2) +; RV32I-NEXT: lw ra, 112(a2) +; RV32I-NEXT: lw a3, 116(a2) +; RV32I-NEXT: lw a1, 120(a2) +; RV32I-NEXT: lw a0, 124(a2) +; RV32I-NEXT: lw a7, %lo(var+4)(a4) +; RV32I-NEXT: lw a6, %lo(var+8)(a4) +; RV32I-NEXT: lw a5, %lo(var+12)(a4) +; RV32I-NEXT: sw a0, 124(a2) +; RV32I-NEXT: sw a1, 120(a2) +; RV32I-NEXT: sw a3, 116(a2) +; RV32I-NEXT: sw ra, 112(a2) +; RV32I-NEXT: sw s11, 108(a2) +; RV32I-NEXT: sw s10, 104(a2) +; RV32I-NEXT: sw s9, 100(a2) +; RV32I-NEXT: sw s8, 96(a2) +; RV32I-NEXT: sw s7, 92(a2) +; RV32I-NEXT: sw s6, 88(a2) +; RV32I-NEXT: sw s5, 84(a2) +; RV32I-NEXT: sw s4, 80(a2) +; RV32I-NEXT: sw s3, 76(a2) +; RV32I-NEXT: sw s2, 72(a2) +; RV32I-NEXT: sw s1, 68(a2) +; RV32I-NEXT: sw s0, 64(a2) +; RV32I-NEXT: sw t6, 60(a2) +; RV32I-NEXT: sw t5, 56(a2) +; RV32I-NEXT: sw t4, 52(a2) +; RV32I-NEXT: sw t3, 48(a2) +; RV32I-NEXT: sw t2, 44(a2) +; RV32I-NEXT: sw t1, 40(a2) +; RV32I-NEXT: sw t0, 36(a2) ; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, 20(a5) +; RV32I-NEXT: sw a0, 32(a2) ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, 16(a5) +; RV32I-NEXT: sw a0, 
28(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+12)(a6) +; RV32I-NEXT: sw a0, 24(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+8)(a6) +; RV32I-NEXT: sw a0, 20(a2) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+4)(a6) +; RV32I-NEXT: sw a0, 16(a2) +; RV32I-NEXT: sw a5, %lo(var+12)(a4) +; RV32I-NEXT: sw a6, %lo(var+8)(a4) +; RV32I-NEXT: sw a7, %lo(var+4)(a4) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var)(a6) +; RV32I-NEXT: sw a0, %lo(var)(a4) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -161,86 +161,86 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: addi s0, sp, 80 -; RV32I-WITH-FP-NEXT: lui a6, %hi(var) -; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a6) +; RV32I-WITH-FP-NEXT: lui a5, %hi(var) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a5) ; RV32I-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) +; RV32I-WITH-FP-NEXT: addi a2, a5, %lo(var) +; RV32I-WITH-FP-NEXT: lw a0, 16(a2) ; RV32I-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) +; RV32I-WITH-FP-NEXT: lw a0, 20(a2) ; RV32I-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) +; RV32I-WITH-FP-NEXT: lw a0, 24(a2) ; RV32I-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: addi a5, a6, %lo(var) -; RV32I-WITH-FP-NEXT: lw a0, 16(a5) +; RV32I-WITH-FP-NEXT: lw a0, 28(a2) ; RV32I-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, 20(a5) +; RV32I-WITH-FP-NEXT: lw a0, 32(a2) ; RV32I-WITH-FP-NEXT: sw a0, -76(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, 24(a5) +; RV32I-WITH-FP-NEXT: lw a0, 36(a2) ; RV32I-WITH-FP-NEXT: sw a0, -80(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw t1, 28(a5) -; RV32I-WITH-FP-NEXT: lw t2, 32(a5) -; RV32I-WITH-FP-NEXT: lw t3, 36(a5) -; RV32I-WITH-FP-NEXT: lw t4, 40(a5) -; RV32I-WITH-FP-NEXT: lw t5, 44(a5) -; RV32I-WITH-FP-NEXT: lw t6, 48(a5) -; RV32I-WITH-FP-NEXT: lw s1, 52(a5) -; RV32I-WITH-FP-NEXT: lw s2, 56(a5) -; RV32I-WITH-FP-NEXT: lw s3, 60(a5) -; RV32I-WITH-FP-NEXT: lw s4, 64(a5) -; RV32I-WITH-FP-NEXT: lw s5, 68(a5) -; RV32I-WITH-FP-NEXT: lw s6, 72(a5) -; RV32I-WITH-FP-NEXT: lw s7, 76(a5) -; RV32I-WITH-FP-NEXT: lw s8, 80(a5) -; RV32I-WITH-FP-NEXT: lw s9, 84(a5) -; RV32I-WITH-FP-NEXT: lw s10, 88(a5) -; RV32I-WITH-FP-NEXT: lw s11, 92(a5) -; RV32I-WITH-FP-NEXT: lw ra, 96(a5) -; RV32I-WITH-FP-NEXT: lw t0, 100(a5) -; RV32I-WITH-FP-NEXT: lw a7, 104(a5) -; RV32I-WITH-FP-NEXT: lw a4, 108(a5) -; RV32I-WITH-FP-NEXT: lw a0, 124(a5) -; RV32I-WITH-FP-NEXT: lw a1, 120(a5) -; RV32I-WITH-FP-NEXT: lw a2, 116(a5) -; RV32I-WITH-FP-NEXT: lw a3, 112(a5) -; RV32I-WITH-FP-NEXT: sw a0, 124(a5) -; RV32I-WITH-FP-NEXT: sw a1, 120(a5) -; RV32I-WITH-FP-NEXT: sw a2, 116(a5) -; RV32I-WITH-FP-NEXT: sw a3, 112(a5) -; RV32I-WITH-FP-NEXT: sw a4, 108(a5) -; RV32I-WITH-FP-NEXT: sw a7, 104(a5) -; RV32I-WITH-FP-NEXT: sw t0, 100(a5) -; RV32I-WITH-FP-NEXT: sw ra, 96(a5) -; RV32I-WITH-FP-NEXT: sw s11, 92(a5) -; RV32I-WITH-FP-NEXT: sw s10, 88(a5) -; RV32I-WITH-FP-NEXT: sw s9, 84(a5) -; RV32I-WITH-FP-NEXT: sw s8, 80(a5) -; RV32I-WITH-FP-NEXT: sw s7, 76(a5) -; RV32I-WITH-FP-NEXT: sw s6, 72(a5) 
-; RV32I-WITH-FP-NEXT: sw s5, 68(a5) -; RV32I-WITH-FP-NEXT: sw s4, 64(a5) -; RV32I-WITH-FP-NEXT: sw s3, 60(a5) -; RV32I-WITH-FP-NEXT: sw s2, 56(a5) -; RV32I-WITH-FP-NEXT: sw s1, 52(a5) -; RV32I-WITH-FP-NEXT: sw t6, 48(a5) -; RV32I-WITH-FP-NEXT: sw t5, 44(a5) -; RV32I-WITH-FP-NEXT: sw t4, 40(a5) -; RV32I-WITH-FP-NEXT: sw t3, 36(a5) -; RV32I-WITH-FP-NEXT: sw t2, 32(a5) -; RV32I-WITH-FP-NEXT: sw t1, 28(a5) +; RV32I-WITH-FP-NEXT: lw t1, 40(a2) +; RV32I-WITH-FP-NEXT: lw t2, 44(a2) +; RV32I-WITH-FP-NEXT: lw t3, 48(a2) +; RV32I-WITH-FP-NEXT: lw t4, 52(a2) +; RV32I-WITH-FP-NEXT: lw t5, 56(a2) +; RV32I-WITH-FP-NEXT: lw t6, 60(a2) +; RV32I-WITH-FP-NEXT: lw s1, 64(a2) +; RV32I-WITH-FP-NEXT: lw s2, 68(a2) +; RV32I-WITH-FP-NEXT: lw s3, 72(a2) +; RV32I-WITH-FP-NEXT: lw s4, 76(a2) +; RV32I-WITH-FP-NEXT: lw s5, 80(a2) +; RV32I-WITH-FP-NEXT: lw s6, 84(a2) +; RV32I-WITH-FP-NEXT: lw s7, 88(a2) +; RV32I-WITH-FP-NEXT: lw s8, 92(a2) +; RV32I-WITH-FP-NEXT: lw s9, 96(a2) +; RV32I-WITH-FP-NEXT: lw s10, 100(a2) +; RV32I-WITH-FP-NEXT: lw s11, 104(a2) +; RV32I-WITH-FP-NEXT: lw ra, 108(a2) +; RV32I-WITH-FP-NEXT: lw a4, 112(a2) +; RV32I-WITH-FP-NEXT: lw a3, 116(a2) +; RV32I-WITH-FP-NEXT: lw a1, 120(a2) +; RV32I-WITH-FP-NEXT: lw a0, 124(a2) +; RV32I-WITH-FP-NEXT: lw t0, %lo(var+4)(a5) +; RV32I-WITH-FP-NEXT: lw a7, %lo(var+8)(a5) +; RV32I-WITH-FP-NEXT: lw a6, %lo(var+12)(a5) +; RV32I-WITH-FP-NEXT: sw a0, 124(a2) +; RV32I-WITH-FP-NEXT: sw a1, 120(a2) +; RV32I-WITH-FP-NEXT: sw a3, 116(a2) +; RV32I-WITH-FP-NEXT: sw a4, 112(a2) +; RV32I-WITH-FP-NEXT: sw ra, 108(a2) +; RV32I-WITH-FP-NEXT: sw s11, 104(a2) +; RV32I-WITH-FP-NEXT: sw s10, 100(a2) +; RV32I-WITH-FP-NEXT: sw s9, 96(a2) +; RV32I-WITH-FP-NEXT: sw s8, 92(a2) +; RV32I-WITH-FP-NEXT: sw s7, 88(a2) +; RV32I-WITH-FP-NEXT: sw s6, 84(a2) +; RV32I-WITH-FP-NEXT: sw s5, 80(a2) +; RV32I-WITH-FP-NEXT: sw s4, 76(a2) +; RV32I-WITH-FP-NEXT: sw s3, 72(a2) +; RV32I-WITH-FP-NEXT: sw s2, 68(a2) +; RV32I-WITH-FP-NEXT: sw s1, 64(a2) +; RV32I-WITH-FP-NEXT: sw t6, 60(a2) +; RV32I-WITH-FP-NEXT: sw t5, 56(a2) +; RV32I-WITH-FP-NEXT: sw t4, 52(a2) +; RV32I-WITH-FP-NEXT: sw t3, 48(a2) +; RV32I-WITH-FP-NEXT: sw t2, 44(a2) +; RV32I-WITH-FP-NEXT: sw t1, 40(a2) ; RV32I-WITH-FP-NEXT: lw a0, -80(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, 24(a5) +; RV32I-WITH-FP-NEXT: sw a0, 36(a2) ; RV32I-WITH-FP-NEXT: lw a0, -76(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, 20(a5) +; RV32I-WITH-FP-NEXT: sw a0, 32(a2) ; RV32I-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, 16(a5) +; RV32I-WITH-FP-NEXT: sw a0, 28(a2) ; RV32I-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) +; RV32I-WITH-FP-NEXT: sw a0, 24(a2) ; RV32I-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) +; RV32I-WITH-FP-NEXT: sw a0, 20(a2) ; RV32I-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) +; RV32I-WITH-FP-NEXT: sw a0, 16(a2) +; RV32I-WITH-FP-NEXT: sw a6, %lo(var+12)(a5) +; RV32I-WITH-FP-NEXT: sw a7, %lo(var+8)(a5) +; RV32I-WITH-FP-NEXT: sw t0, %lo(var+4)(a5) ; RV32I-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a6) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a5) ; RV32I-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -260,84 +260,84 @@ define void @callee() nounwind { ; RV32IZCMP-LABEL: 
callee: ; RV32IZCMP: # %bb.0: ; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-NEXT: lui a6, %hi(var) -; RV32IZCMP-NEXT: lw a0, %lo(var)(a6) +; RV32IZCMP-NEXT: lui a5, %hi(var) +; RV32IZCMP-NEXT: lw a0, %lo(var)(a5) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+4)(a6) +; RV32IZCMP-NEXT: addi a2, a5, %lo(var) +; RV32IZCMP-NEXT: lw a0, 16(a2) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+8)(a6) +; RV32IZCMP-NEXT: lw a0, 20(a2) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+12)(a6) +; RV32IZCMP-NEXT: lw a0, 24(a2) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a6, %lo(var) -; RV32IZCMP-NEXT: lw a0, 16(a5) +; RV32IZCMP-NEXT: lw a0, 28(a2) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, 20(a5) +; RV32IZCMP-NEXT: lw a0, 32(a2) ; RV32IZCMP-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw t4, 24(a5) -; RV32IZCMP-NEXT: lw t5, 28(a5) -; RV32IZCMP-NEXT: lw t6, 32(a5) -; RV32IZCMP-NEXT: lw s2, 36(a5) -; RV32IZCMP-NEXT: lw s3, 40(a5) -; RV32IZCMP-NEXT: lw s4, 44(a5) -; RV32IZCMP-NEXT: lw s5, 48(a5) -; RV32IZCMP-NEXT: lw s6, 52(a5) -; RV32IZCMP-NEXT: lw s7, 56(a5) -; RV32IZCMP-NEXT: lw s8, 60(a5) -; RV32IZCMP-NEXT: lw s9, 64(a5) -; RV32IZCMP-NEXT: lw s10, 68(a5) -; RV32IZCMP-NEXT: lw s11, 72(a5) -; RV32IZCMP-NEXT: lw ra, 76(a5) -; RV32IZCMP-NEXT: lw s1, 80(a5) -; RV32IZCMP-NEXT: lw t3, 84(a5) -; RV32IZCMP-NEXT: lw t2, 88(a5) -; RV32IZCMP-NEXT: lw t1, 92(a5) -; RV32IZCMP-NEXT: lw t0, 96(a5) -; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a7, 104(a5) -; RV32IZCMP-NEXT: lw a4, 108(a5) -; RV32IZCMP-NEXT: lw a0, 124(a5) -; RV32IZCMP-NEXT: lw a1, 120(a5) -; RV32IZCMP-NEXT: lw a2, 116(a5) -; RV32IZCMP-NEXT: lw a3, 112(a5) -; RV32IZCMP-NEXT: sw a0, 124(a5) -; RV32IZCMP-NEXT: sw a1, 120(a5) -; RV32IZCMP-NEXT: sw a2, 116(a5) -; RV32IZCMP-NEXT: sw a3, 112(a5) -; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a7, 104(a5) -; RV32IZCMP-NEXT: sw s0, 100(a5) -; RV32IZCMP-NEXT: sw t0, 96(a5) -; RV32IZCMP-NEXT: sw t1, 92(a5) -; RV32IZCMP-NEXT: sw t2, 88(a5) -; RV32IZCMP-NEXT: sw t3, 84(a5) -; RV32IZCMP-NEXT: sw s1, 80(a5) -; RV32IZCMP-NEXT: sw ra, 76(a5) -; RV32IZCMP-NEXT: sw s11, 72(a5) -; RV32IZCMP-NEXT: sw s10, 68(a5) -; RV32IZCMP-NEXT: sw s9, 64(a5) -; RV32IZCMP-NEXT: sw s8, 60(a5) -; RV32IZCMP-NEXT: sw s7, 56(a5) -; RV32IZCMP-NEXT: sw s6, 52(a5) -; RV32IZCMP-NEXT: sw s5, 48(a5) -; RV32IZCMP-NEXT: sw s4, 44(a5) -; RV32IZCMP-NEXT: sw s3, 40(a5) -; RV32IZCMP-NEXT: sw s2, 36(a5) -; RV32IZCMP-NEXT: sw t6, 32(a5) -; RV32IZCMP-NEXT: sw t5, 28(a5) -; RV32IZCMP-NEXT: sw t4, 24(a5) +; RV32IZCMP-NEXT: lw t4, 36(a2) +; RV32IZCMP-NEXT: lw t5, 40(a2) +; RV32IZCMP-NEXT: lw t6, 44(a2) +; RV32IZCMP-NEXT: lw s2, 48(a2) +; RV32IZCMP-NEXT: lw s3, 52(a2) +; RV32IZCMP-NEXT: lw s4, 56(a2) +; RV32IZCMP-NEXT: lw s5, 60(a2) +; RV32IZCMP-NEXT: lw s6, 64(a2) +; RV32IZCMP-NEXT: lw s7, 68(a2) +; RV32IZCMP-NEXT: lw s8, 72(a2) +; RV32IZCMP-NEXT: lw s9, 76(a2) +; RV32IZCMP-NEXT: lw s10, 80(a2) +; RV32IZCMP-NEXT: lw s11, 84(a2) +; RV32IZCMP-NEXT: lw ra, 88(a2) +; RV32IZCMP-NEXT: lw s1, 92(a2) +; RV32IZCMP-NEXT: lw t0, 96(a2) +; RV32IZCMP-NEXT: lw a7, 100(a2) +; RV32IZCMP-NEXT: lw a6, 104(a2) +; RV32IZCMP-NEXT: lw a4, 108(a2) +; RV32IZCMP-NEXT: lw s0, 112(a2) +; RV32IZCMP-NEXT: lw a3, 116(a2) +; RV32IZCMP-NEXT: lw a1, 120(a2) +; RV32IZCMP-NEXT: lw a0, 124(a2) +; RV32IZCMP-NEXT: lw t3, %lo(var+4)(a5) +; RV32IZCMP-NEXT: lw t2, 
%lo(var+8)(a5) +; RV32IZCMP-NEXT: lw t1, %lo(var+12)(a5) +; RV32IZCMP-NEXT: sw a0, 124(a2) +; RV32IZCMP-NEXT: sw a1, 120(a2) +; RV32IZCMP-NEXT: sw a3, 116(a2) +; RV32IZCMP-NEXT: sw s0, 112(a2) +; RV32IZCMP-NEXT: sw a4, 108(a2) +; RV32IZCMP-NEXT: sw a6, 104(a2) +; RV32IZCMP-NEXT: sw a7, 100(a2) +; RV32IZCMP-NEXT: sw t0, 96(a2) +; RV32IZCMP-NEXT: sw s1, 92(a2) +; RV32IZCMP-NEXT: sw ra, 88(a2) +; RV32IZCMP-NEXT: sw s11, 84(a2) +; RV32IZCMP-NEXT: sw s10, 80(a2) +; RV32IZCMP-NEXT: sw s9, 76(a2) +; RV32IZCMP-NEXT: sw s8, 72(a2) +; RV32IZCMP-NEXT: sw s7, 68(a2) +; RV32IZCMP-NEXT: sw s6, 64(a2) +; RV32IZCMP-NEXT: sw s5, 60(a2) +; RV32IZCMP-NEXT: sw s4, 56(a2) +; RV32IZCMP-NEXT: sw s3, 52(a2) +; RV32IZCMP-NEXT: sw s2, 48(a2) +; RV32IZCMP-NEXT: sw t6, 44(a2) +; RV32IZCMP-NEXT: sw t5, 40(a2) +; RV32IZCMP-NEXT: sw t4, 36(a2) ; RV32IZCMP-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, 20(a5) +; RV32IZCMP-NEXT: sw a0, 32(a2) ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, 16(a5) +; RV32IZCMP-NEXT: sw a0, 28(a2) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+12)(a6) +; RV32IZCMP-NEXT: sw a0, 24(a2) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+8)(a6) +; RV32IZCMP-NEXT: sw a0, 20(a2) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+4)(a6) +; RV32IZCMP-NEXT: sw a0, 16(a2) +; RV32IZCMP-NEXT: sw t1, %lo(var+12)(a5) +; RV32IZCMP-NEXT: sw t2, %lo(var+8)(a5) +; RV32IZCMP-NEXT: sw t3, %lo(var+4)(a5) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var)(a5) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV32IZCMP-WITH-FP-LABEL: callee: @@ -360,81 +360,81 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: lui a6, %hi(var) ; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) +; RV32IZCMP-WITH-FP-NEXT: addi a2, a6, %lo(var) +; RV32IZCMP-WITH-FP-NEXT: lw a0, 16(a2) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) +; RV32IZCMP-WITH-FP-NEXT: lw a0, 20(a2) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) +; RV32IZCMP-WITH-FP-NEXT: lw a0, 24(a2) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: addi a5, a6, %lo(var) -; RV32IZCMP-WITH-FP-NEXT: lw a0, 16(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a0, 28(a2) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, 20(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a0, 32(a2) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -76(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, 24(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a0, 36(a2) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -80(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw t5, 28(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t6, 32(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s2, 36(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s3, 40(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s4, 44(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s5, 48(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s6, 52(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s7, 56(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s8, 60(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s9, 64(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s10, 68(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s11, 72(a5) -; RV32IZCMP-WITH-FP-NEXT: lw ra, 76(a5) -; RV32IZCMP-WITH-FP-NEXT: 
lw t4, 80(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t3, 84(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t2, 88(a5) -; RV32IZCMP-WITH-FP-NEXT: lw s1, 92(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t1, 96(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t0, 100(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a7, 104(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a4, 108(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a0, 124(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a1, 120(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a2, 116(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a3, 112(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a0, 124(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a1, 120(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a2, 116(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a3, 112(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a4, 108(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a7, 104(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t0, 100(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t1, 96(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s1, 92(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t2, 88(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t3, 84(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t4, 80(a5) -; RV32IZCMP-WITH-FP-NEXT: sw ra, 76(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s11, 72(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s10, 68(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s9, 64(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s8, 60(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s7, 56(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s6, 52(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s5, 48(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s4, 44(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s3, 40(a5) -; RV32IZCMP-WITH-FP-NEXT: sw s2, 36(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t6, 32(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t5, 28(a5) +; RV32IZCMP-WITH-FP-NEXT: lw t5, 40(a2) +; RV32IZCMP-WITH-FP-NEXT: lw t6, 44(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s2, 48(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s3, 52(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s4, 56(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s5, 60(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s6, 64(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s7, 68(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s8, 72(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s9, 76(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s10, 80(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s11, 84(a2) +; RV32IZCMP-WITH-FP-NEXT: lw ra, 88(a2) +; RV32IZCMP-WITH-FP-NEXT: lw t1, 92(a2) +; RV32IZCMP-WITH-FP-NEXT: lw t0, 96(a2) +; RV32IZCMP-WITH-FP-NEXT: lw a7, 100(a2) +; RV32IZCMP-WITH-FP-NEXT: lw s1, 104(a2) +; RV32IZCMP-WITH-FP-NEXT: lw a5, 108(a2) +; RV32IZCMP-WITH-FP-NEXT: lw a4, 112(a2) +; RV32IZCMP-WITH-FP-NEXT: lw a3, 116(a2) +; RV32IZCMP-WITH-FP-NEXT: lw a1, 120(a2) +; RV32IZCMP-WITH-FP-NEXT: lw a0, 124(a2) +; RV32IZCMP-WITH-FP-NEXT: lw t4, %lo(var+4)(a6) +; RV32IZCMP-WITH-FP-NEXT: lw t3, %lo(var+8)(a6) +; RV32IZCMP-WITH-FP-NEXT: lw t2, %lo(var+12)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw a0, 124(a2) +; RV32IZCMP-WITH-FP-NEXT: sw a1, 120(a2) +; RV32IZCMP-WITH-FP-NEXT: sw a3, 116(a2) +; RV32IZCMP-WITH-FP-NEXT: sw a4, 112(a2) +; RV32IZCMP-WITH-FP-NEXT: sw a5, 108(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s1, 104(a2) +; RV32IZCMP-WITH-FP-NEXT: sw a7, 100(a2) +; RV32IZCMP-WITH-FP-NEXT: sw t0, 96(a2) +; RV32IZCMP-WITH-FP-NEXT: sw t1, 92(a2) +; RV32IZCMP-WITH-FP-NEXT: sw ra, 88(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s11, 84(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s10, 80(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s9, 76(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s8, 72(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s7, 68(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s6, 64(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s5, 60(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s4, 56(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s3, 52(a2) +; RV32IZCMP-WITH-FP-NEXT: sw s2, 48(a2) +; RV32IZCMP-WITH-FP-NEXT: sw t6, 44(a2) +; RV32IZCMP-WITH-FP-NEXT: sw t5, 40(a2) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -80(s0) # 4-byte Folded Reload -; 
RV32IZCMP-WITH-FP-NEXT: sw a0, 24(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a0, 36(a2) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -76(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, 20(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a0, 32(a2) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, 16(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a0, 28(a2) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw a0, 24(a2) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw a0, 20(a2) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw a0, 16(a2) +; RV32IZCMP-WITH-FP-NEXT: sw t2, %lo(var+12)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw t3, %lo(var+8)(a6) +; RV32IZCMP-WITH-FP-NEXT: sw t4, %lo(var+4)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload @@ -469,84 +469,84 @@ define void @callee() nounwind { ; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a6, %hi(var) -; RV64I-NEXT: lw a0, %lo(var)(a6) +; RV64I-NEXT: lui a4, %hi(var) +; RV64I-NEXT: lw a0, %lo(var)(a4) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+4)(a6) +; RV64I-NEXT: addi a2, a4, %lo(var) +; RV64I-NEXT: lw a0, 16(a2) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+8)(a6) +; RV64I-NEXT: lw a0, 20(a2) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+12)(a6) +; RV64I-NEXT: lw a0, 24(a2) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a6, %lo(var) -; RV64I-NEXT: lw a0, 16(a5) +; RV64I-NEXT: lw a0, 28(a2) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, 20(a5) +; RV64I-NEXT: lw a0, 32(a2) ; RV64I-NEXT: sd a0, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw t0, 24(a5) -; RV64I-NEXT: lw t1, 28(a5) -; RV64I-NEXT: lw t2, 32(a5) -; RV64I-NEXT: lw t3, 36(a5) -; RV64I-NEXT: lw t4, 40(a5) -; RV64I-NEXT: lw t5, 44(a5) -; RV64I-NEXT: lw t6, 48(a5) -; RV64I-NEXT: lw s0, 52(a5) -; RV64I-NEXT: lw s1, 56(a5) -; RV64I-NEXT: lw s2, 60(a5) -; RV64I-NEXT: lw s3, 64(a5) -; RV64I-NEXT: lw s4, 68(a5) -; RV64I-NEXT: lw s5, 72(a5) -; RV64I-NEXT: lw s6, 76(a5) -; RV64I-NEXT: lw s7, 80(a5) -; RV64I-NEXT: lw s8, 84(a5) -; RV64I-NEXT: lw s9, 88(a5) -; RV64I-NEXT: lw s10, 92(a5) -; RV64I-NEXT: lw s11, 96(a5) -; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a7, 104(a5) -; RV64I-NEXT: lw a4, 108(a5) -; RV64I-NEXT: lw a0, 124(a5) -; RV64I-NEXT: lw a1, 120(a5) -; RV64I-NEXT: lw a2, 116(a5) -; RV64I-NEXT: lw a3, 112(a5) -; RV64I-NEXT: sw a0, 124(a5) -; RV64I-NEXT: sw a1, 120(a5) -; RV64I-NEXT: sw a2, 116(a5) -; RV64I-NEXT: sw a3, 112(a5) -; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a7, 104(a5) -; RV64I-NEXT: sw ra, 100(a5) -; RV64I-NEXT: sw s11, 96(a5) -; RV64I-NEXT: sw s10, 92(a5) -; RV64I-NEXT: sw s9, 88(a5) -; RV64I-NEXT: sw s8, 84(a5) -; RV64I-NEXT: sw s7, 80(a5) -; RV64I-NEXT: sw s6, 76(a5) -; RV64I-NEXT: sw s5, 72(a5) -; RV64I-NEXT: sw s4, 68(a5) -; RV64I-NEXT: sw s3, 64(a5) -; RV64I-NEXT: sw s2, 60(a5) -; RV64I-NEXT: sw s1, 56(a5) -; RV64I-NEXT: sw s0, 52(a5) -; RV64I-NEXT: sw t6, 48(a5) -; RV64I-NEXT: sw 
t5, 44(a5) -; RV64I-NEXT: sw t4, 40(a5) -; RV64I-NEXT: sw t3, 36(a5) -; RV64I-NEXT: sw t2, 32(a5) -; RV64I-NEXT: sw t1, 28(a5) -; RV64I-NEXT: sw t0, 24(a5) +; RV64I-NEXT: lw t0, 36(a2) +; RV64I-NEXT: lw t1, 40(a2) +; RV64I-NEXT: lw t2, 44(a2) +; RV64I-NEXT: lw t3, 48(a2) +; RV64I-NEXT: lw t4, 52(a2) +; RV64I-NEXT: lw t5, 56(a2) +; RV64I-NEXT: lw t6, 60(a2) +; RV64I-NEXT: lw s0, 64(a2) +; RV64I-NEXT: lw s1, 68(a2) +; RV64I-NEXT: lw s2, 72(a2) +; RV64I-NEXT: lw s3, 76(a2) +; RV64I-NEXT: lw s4, 80(a2) +; RV64I-NEXT: lw s5, 84(a2) +; RV64I-NEXT: lw s6, 88(a2) +; RV64I-NEXT: lw s7, 92(a2) +; RV64I-NEXT: lw s8, 96(a2) +; RV64I-NEXT: lw s9, 100(a2) +; RV64I-NEXT: lw s10, 104(a2) +; RV64I-NEXT: lw s11, 108(a2) +; RV64I-NEXT: lw ra, 112(a2) +; RV64I-NEXT: lw a3, 116(a2) +; RV64I-NEXT: lw a1, 120(a2) +; RV64I-NEXT: lw a0, 124(a2) +; RV64I-NEXT: lw a7, %lo(var+4)(a4) +; RV64I-NEXT: lw a6, %lo(var+8)(a4) +; RV64I-NEXT: lw a5, %lo(var+12)(a4) +; RV64I-NEXT: sw a0, 124(a2) +; RV64I-NEXT: sw a1, 120(a2) +; RV64I-NEXT: sw a3, 116(a2) +; RV64I-NEXT: sw ra, 112(a2) +; RV64I-NEXT: sw s11, 108(a2) +; RV64I-NEXT: sw s10, 104(a2) +; RV64I-NEXT: sw s9, 100(a2) +; RV64I-NEXT: sw s8, 96(a2) +; RV64I-NEXT: sw s7, 92(a2) +; RV64I-NEXT: sw s6, 88(a2) +; RV64I-NEXT: sw s5, 84(a2) +; RV64I-NEXT: sw s4, 80(a2) +; RV64I-NEXT: sw s3, 76(a2) +; RV64I-NEXT: sw s2, 72(a2) +; RV64I-NEXT: sw s1, 68(a2) +; RV64I-NEXT: sw s0, 64(a2) +; RV64I-NEXT: sw t6, 60(a2) +; RV64I-NEXT: sw t5, 56(a2) +; RV64I-NEXT: sw t4, 52(a2) +; RV64I-NEXT: sw t3, 48(a2) +; RV64I-NEXT: sw t2, 44(a2) +; RV64I-NEXT: sw t1, 40(a2) +; RV64I-NEXT: sw t0, 36(a2) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, 20(a5) +; RV64I-NEXT: sw a0, 32(a2) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, 16(a5) +; RV64I-NEXT: sw a0, 28(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+12)(a6) +; RV64I-NEXT: sw a0, 24(a2) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+8)(a6) +; RV64I-NEXT: sw a0, 20(a2) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+4)(a6) +; RV64I-NEXT: sw a0, 16(a2) +; RV64I-NEXT: sw a5, %lo(var+12)(a4) +; RV64I-NEXT: sw a6, %lo(var+8)(a4) +; RV64I-NEXT: sw a7, %lo(var+4)(a4) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var)(a6) +; RV64I-NEXT: sw a0, %lo(var)(a4) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -580,86 +580,86 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: sd s11, 56(sp) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: addi s0, sp, 160 -; RV64I-WITH-FP-NEXT: lui a6, %hi(var) -; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a6) +; RV64I-WITH-FP-NEXT: lui a5, %hi(var) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a5) ; RV64I-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) +; RV64I-WITH-FP-NEXT: addi a2, a5, %lo(var) +; RV64I-WITH-FP-NEXT: lw a0, 16(a2) ; RV64I-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) +; RV64I-WITH-FP-NEXT: lw a0, 20(a2) ; RV64I-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) +; RV64I-WITH-FP-NEXT: lw a0, 24(a2) ; RV64I-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: addi a5, a6, %lo(var) -; 
RV64I-WITH-FP-NEXT: lw a0, 16(a5) +; RV64I-WITH-FP-NEXT: lw a0, 28(a2) ; RV64I-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, 20(a5) +; RV64I-WITH-FP-NEXT: lw a0, 32(a2) ; RV64I-WITH-FP-NEXT: sd a0, -152(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, 24(a5) +; RV64I-WITH-FP-NEXT: lw a0, 36(a2) ; RV64I-WITH-FP-NEXT: sd a0, -160(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw t1, 28(a5) -; RV64I-WITH-FP-NEXT: lw t2, 32(a5) -; RV64I-WITH-FP-NEXT: lw t3, 36(a5) -; RV64I-WITH-FP-NEXT: lw t4, 40(a5) -; RV64I-WITH-FP-NEXT: lw t5, 44(a5) -; RV64I-WITH-FP-NEXT: lw t6, 48(a5) -; RV64I-WITH-FP-NEXT: lw s1, 52(a5) -; RV64I-WITH-FP-NEXT: lw s2, 56(a5) -; RV64I-WITH-FP-NEXT: lw s3, 60(a5) -; RV64I-WITH-FP-NEXT: lw s4, 64(a5) -; RV64I-WITH-FP-NEXT: lw s5, 68(a5) -; RV64I-WITH-FP-NEXT: lw s6, 72(a5) -; RV64I-WITH-FP-NEXT: lw s7, 76(a5) -; RV64I-WITH-FP-NEXT: lw s8, 80(a5) -; RV64I-WITH-FP-NEXT: lw s9, 84(a5) -; RV64I-WITH-FP-NEXT: lw s10, 88(a5) -; RV64I-WITH-FP-NEXT: lw s11, 92(a5) -; RV64I-WITH-FP-NEXT: lw ra, 96(a5) -; RV64I-WITH-FP-NEXT: lw t0, 100(a5) -; RV64I-WITH-FP-NEXT: lw a7, 104(a5) -; RV64I-WITH-FP-NEXT: lw a4, 108(a5) -; RV64I-WITH-FP-NEXT: lw a0, 124(a5) -; RV64I-WITH-FP-NEXT: lw a1, 120(a5) -; RV64I-WITH-FP-NEXT: lw a2, 116(a5) -; RV64I-WITH-FP-NEXT: lw a3, 112(a5) -; RV64I-WITH-FP-NEXT: sw a0, 124(a5) -; RV64I-WITH-FP-NEXT: sw a1, 120(a5) -; RV64I-WITH-FP-NEXT: sw a2, 116(a5) -; RV64I-WITH-FP-NEXT: sw a3, 112(a5) -; RV64I-WITH-FP-NEXT: sw a4, 108(a5) -; RV64I-WITH-FP-NEXT: sw a7, 104(a5) -; RV64I-WITH-FP-NEXT: sw t0, 100(a5) -; RV64I-WITH-FP-NEXT: sw ra, 96(a5) -; RV64I-WITH-FP-NEXT: sw s11, 92(a5) -; RV64I-WITH-FP-NEXT: sw s10, 88(a5) -; RV64I-WITH-FP-NEXT: sw s9, 84(a5) -; RV64I-WITH-FP-NEXT: sw s8, 80(a5) -; RV64I-WITH-FP-NEXT: sw s7, 76(a5) -; RV64I-WITH-FP-NEXT: sw s6, 72(a5) -; RV64I-WITH-FP-NEXT: sw s5, 68(a5) -; RV64I-WITH-FP-NEXT: sw s4, 64(a5) -; RV64I-WITH-FP-NEXT: sw s3, 60(a5) -; RV64I-WITH-FP-NEXT: sw s2, 56(a5) -; RV64I-WITH-FP-NEXT: sw s1, 52(a5) -; RV64I-WITH-FP-NEXT: sw t6, 48(a5) -; RV64I-WITH-FP-NEXT: sw t5, 44(a5) -; RV64I-WITH-FP-NEXT: sw t4, 40(a5) -; RV64I-WITH-FP-NEXT: sw t3, 36(a5) -; RV64I-WITH-FP-NEXT: sw t2, 32(a5) -; RV64I-WITH-FP-NEXT: sw t1, 28(a5) +; RV64I-WITH-FP-NEXT: lw t1, 40(a2) +; RV64I-WITH-FP-NEXT: lw t2, 44(a2) +; RV64I-WITH-FP-NEXT: lw t3, 48(a2) +; RV64I-WITH-FP-NEXT: lw t4, 52(a2) +; RV64I-WITH-FP-NEXT: lw t5, 56(a2) +; RV64I-WITH-FP-NEXT: lw t6, 60(a2) +; RV64I-WITH-FP-NEXT: lw s1, 64(a2) +; RV64I-WITH-FP-NEXT: lw s2, 68(a2) +; RV64I-WITH-FP-NEXT: lw s3, 72(a2) +; RV64I-WITH-FP-NEXT: lw s4, 76(a2) +; RV64I-WITH-FP-NEXT: lw s5, 80(a2) +; RV64I-WITH-FP-NEXT: lw s6, 84(a2) +; RV64I-WITH-FP-NEXT: lw s7, 88(a2) +; RV64I-WITH-FP-NEXT: lw s8, 92(a2) +; RV64I-WITH-FP-NEXT: lw s9, 96(a2) +; RV64I-WITH-FP-NEXT: lw s10, 100(a2) +; RV64I-WITH-FP-NEXT: lw s11, 104(a2) +; RV64I-WITH-FP-NEXT: lw ra, 108(a2) +; RV64I-WITH-FP-NEXT: lw a4, 112(a2) +; RV64I-WITH-FP-NEXT: lw a3, 116(a2) +; RV64I-WITH-FP-NEXT: lw a1, 120(a2) +; RV64I-WITH-FP-NEXT: lw a0, 124(a2) +; RV64I-WITH-FP-NEXT: lw t0, %lo(var+4)(a5) +; RV64I-WITH-FP-NEXT: lw a7, %lo(var+8)(a5) +; RV64I-WITH-FP-NEXT: lw a6, %lo(var+12)(a5) +; RV64I-WITH-FP-NEXT: sw a0, 124(a2) +; RV64I-WITH-FP-NEXT: sw a1, 120(a2) +; RV64I-WITH-FP-NEXT: sw a3, 116(a2) +; RV64I-WITH-FP-NEXT: sw a4, 112(a2) +; RV64I-WITH-FP-NEXT: sw ra, 108(a2) +; RV64I-WITH-FP-NEXT: sw s11, 104(a2) +; RV64I-WITH-FP-NEXT: sw s10, 100(a2) +; RV64I-WITH-FP-NEXT: sw s9, 96(a2) +; RV64I-WITH-FP-NEXT: 
sw s8, 92(a2) +; RV64I-WITH-FP-NEXT: sw s7, 88(a2) +; RV64I-WITH-FP-NEXT: sw s6, 84(a2) +; RV64I-WITH-FP-NEXT: sw s5, 80(a2) +; RV64I-WITH-FP-NEXT: sw s4, 76(a2) +; RV64I-WITH-FP-NEXT: sw s3, 72(a2) +; RV64I-WITH-FP-NEXT: sw s2, 68(a2) +; RV64I-WITH-FP-NEXT: sw s1, 64(a2) +; RV64I-WITH-FP-NEXT: sw t6, 60(a2) +; RV64I-WITH-FP-NEXT: sw t5, 56(a2) +; RV64I-WITH-FP-NEXT: sw t4, 52(a2) +; RV64I-WITH-FP-NEXT: sw t3, 48(a2) +; RV64I-WITH-FP-NEXT: sw t2, 44(a2) +; RV64I-WITH-FP-NEXT: sw t1, 40(a2) ; RV64I-WITH-FP-NEXT: ld a0, -160(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, 24(a5) +; RV64I-WITH-FP-NEXT: sw a0, 36(a2) ; RV64I-WITH-FP-NEXT: ld a0, -152(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, 20(a5) +; RV64I-WITH-FP-NEXT: sw a0, 32(a2) ; RV64I-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, 16(a5) +; RV64I-WITH-FP-NEXT: sw a0, 28(a2) ; RV64I-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) +; RV64I-WITH-FP-NEXT: sw a0, 24(a2) ; RV64I-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) +; RV64I-WITH-FP-NEXT: sw a0, 20(a2) ; RV64I-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) +; RV64I-WITH-FP-NEXT: sw a0, 16(a2) +; RV64I-WITH-FP-NEXT: sw a6, %lo(var+12)(a5) +; RV64I-WITH-FP-NEXT: sw a7, %lo(var+8)(a5) +; RV64I-WITH-FP-NEXT: sw t0, %lo(var+4)(a5) ; RV64I-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a6) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a5) ; RV64I-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -679,84 +679,84 @@ define void @callee() nounwind { ; RV64IZCMP-LABEL: callee: ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-NEXT: lui a6, %hi(var) -; RV64IZCMP-NEXT: lw a0, %lo(var)(a6) +; RV64IZCMP-NEXT: lui a5, %hi(var) +; RV64IZCMP-NEXT: lw a0, %lo(var)(a5) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+4)(a6) +; RV64IZCMP-NEXT: addi a2, a5, %lo(var) +; RV64IZCMP-NEXT: lw a0, 16(a2) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+8)(a6) +; RV64IZCMP-NEXT: lw a0, 20(a2) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+12)(a6) +; RV64IZCMP-NEXT: lw a0, 24(a2) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a6, %lo(var) -; RV64IZCMP-NEXT: lw a0, 16(a5) +; RV64IZCMP-NEXT: lw a0, 28(a2) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, 20(a5) +; RV64IZCMP-NEXT: lw a0, 32(a2) ; RV64IZCMP-NEXT: sd a0, 0(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw t4, 24(a5) -; RV64IZCMP-NEXT: lw t5, 28(a5) -; RV64IZCMP-NEXT: lw t6, 32(a5) -; RV64IZCMP-NEXT: lw s2, 36(a5) -; RV64IZCMP-NEXT: lw s3, 40(a5) -; RV64IZCMP-NEXT: lw s4, 44(a5) -; RV64IZCMP-NEXT: lw s5, 48(a5) -; RV64IZCMP-NEXT: lw s6, 52(a5) -; RV64IZCMP-NEXT: lw s7, 56(a5) -; RV64IZCMP-NEXT: lw s8, 60(a5) -; RV64IZCMP-NEXT: lw s9, 64(a5) -; RV64IZCMP-NEXT: lw s10, 68(a5) -; RV64IZCMP-NEXT: lw s11, 72(a5) -; RV64IZCMP-NEXT: lw ra, 76(a5) -; RV64IZCMP-NEXT: lw s1, 80(a5) -; RV64IZCMP-NEXT: lw t3, 84(a5) -; RV64IZCMP-NEXT: lw t2, 88(a5) -; RV64IZCMP-NEXT: lw t1, 92(a5) -; RV64IZCMP-NEXT: lw t0, 96(a5) -; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a7, 104(a5) -; 
RV64IZCMP-NEXT: lw a4, 108(a5) -; RV64IZCMP-NEXT: lw a0, 124(a5) -; RV64IZCMP-NEXT: lw a1, 120(a5) -; RV64IZCMP-NEXT: lw a2, 116(a5) -; RV64IZCMP-NEXT: lw a3, 112(a5) -; RV64IZCMP-NEXT: sw a0, 124(a5) -; RV64IZCMP-NEXT: sw a1, 120(a5) -; RV64IZCMP-NEXT: sw a2, 116(a5) -; RV64IZCMP-NEXT: sw a3, 112(a5) -; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a7, 104(a5) -; RV64IZCMP-NEXT: sw s0, 100(a5) -; RV64IZCMP-NEXT: sw t0, 96(a5) -; RV64IZCMP-NEXT: sw t1, 92(a5) -; RV64IZCMP-NEXT: sw t2, 88(a5) -; RV64IZCMP-NEXT: sw t3, 84(a5) -; RV64IZCMP-NEXT: sw s1, 80(a5) -; RV64IZCMP-NEXT: sw ra, 76(a5) -; RV64IZCMP-NEXT: sw s11, 72(a5) -; RV64IZCMP-NEXT: sw s10, 68(a5) -; RV64IZCMP-NEXT: sw s9, 64(a5) -; RV64IZCMP-NEXT: sw s8, 60(a5) -; RV64IZCMP-NEXT: sw s7, 56(a5) -; RV64IZCMP-NEXT: sw s6, 52(a5) -; RV64IZCMP-NEXT: sw s5, 48(a5) -; RV64IZCMP-NEXT: sw s4, 44(a5) -; RV64IZCMP-NEXT: sw s3, 40(a5) -; RV64IZCMP-NEXT: sw s2, 36(a5) -; RV64IZCMP-NEXT: sw t6, 32(a5) -; RV64IZCMP-NEXT: sw t5, 28(a5) -; RV64IZCMP-NEXT: sw t4, 24(a5) +; RV64IZCMP-NEXT: lw t4, 36(a2) +; RV64IZCMP-NEXT: lw t5, 40(a2) +; RV64IZCMP-NEXT: lw t6, 44(a2) +; RV64IZCMP-NEXT: lw s2, 48(a2) +; RV64IZCMP-NEXT: lw s3, 52(a2) +; RV64IZCMP-NEXT: lw s4, 56(a2) +; RV64IZCMP-NEXT: lw s5, 60(a2) +; RV64IZCMP-NEXT: lw s6, 64(a2) +; RV64IZCMP-NEXT: lw s7, 68(a2) +; RV64IZCMP-NEXT: lw s8, 72(a2) +; RV64IZCMP-NEXT: lw s9, 76(a2) +; RV64IZCMP-NEXT: lw s10, 80(a2) +; RV64IZCMP-NEXT: lw s11, 84(a2) +; RV64IZCMP-NEXT: lw ra, 88(a2) +; RV64IZCMP-NEXT: lw s1, 92(a2) +; RV64IZCMP-NEXT: lw t0, 96(a2) +; RV64IZCMP-NEXT: lw a7, 100(a2) +; RV64IZCMP-NEXT: lw a6, 104(a2) +; RV64IZCMP-NEXT: lw a4, 108(a2) +; RV64IZCMP-NEXT: lw s0, 112(a2) +; RV64IZCMP-NEXT: lw a3, 116(a2) +; RV64IZCMP-NEXT: lw a1, 120(a2) +; RV64IZCMP-NEXT: lw a0, 124(a2) +; RV64IZCMP-NEXT: lw t3, %lo(var+4)(a5) +; RV64IZCMP-NEXT: lw t2, %lo(var+8)(a5) +; RV64IZCMP-NEXT: lw t1, %lo(var+12)(a5) +; RV64IZCMP-NEXT: sw a0, 124(a2) +; RV64IZCMP-NEXT: sw a1, 120(a2) +; RV64IZCMP-NEXT: sw a3, 116(a2) +; RV64IZCMP-NEXT: sw s0, 112(a2) +; RV64IZCMP-NEXT: sw a4, 108(a2) +; RV64IZCMP-NEXT: sw a6, 104(a2) +; RV64IZCMP-NEXT: sw a7, 100(a2) +; RV64IZCMP-NEXT: sw t0, 96(a2) +; RV64IZCMP-NEXT: sw s1, 92(a2) +; RV64IZCMP-NEXT: sw ra, 88(a2) +; RV64IZCMP-NEXT: sw s11, 84(a2) +; RV64IZCMP-NEXT: sw s10, 80(a2) +; RV64IZCMP-NEXT: sw s9, 76(a2) +; RV64IZCMP-NEXT: sw s8, 72(a2) +; RV64IZCMP-NEXT: sw s7, 68(a2) +; RV64IZCMP-NEXT: sw s6, 64(a2) +; RV64IZCMP-NEXT: sw s5, 60(a2) +; RV64IZCMP-NEXT: sw s4, 56(a2) +; RV64IZCMP-NEXT: sw s3, 52(a2) +; RV64IZCMP-NEXT: sw s2, 48(a2) +; RV64IZCMP-NEXT: sw t6, 44(a2) +; RV64IZCMP-NEXT: sw t5, 40(a2) +; RV64IZCMP-NEXT: sw t4, 36(a2) ; RV64IZCMP-NEXT: ld a0, 0(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, 20(a5) +; RV64IZCMP-NEXT: sw a0, 32(a2) ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, 16(a5) +; RV64IZCMP-NEXT: sw a0, 28(a2) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+12)(a6) +; RV64IZCMP-NEXT: sw a0, 24(a2) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+8)(a6) +; RV64IZCMP-NEXT: sw a0, 20(a2) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+4)(a6) +; RV64IZCMP-NEXT: sw a0, 16(a2) +; RV64IZCMP-NEXT: sw t1, %lo(var+12)(a5) +; RV64IZCMP-NEXT: sw t2, %lo(var+8)(a5) +; RV64IZCMP-NEXT: sw t3, %lo(var+4)(a5) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var)(a6) +; 
RV64IZCMP-NEXT: sw a0, %lo(var)(a5) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV64IZCMP-WITH-FP-LABEL: callee: @@ -779,81 +779,81 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: lui a6, %hi(var) ; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) +; RV64IZCMP-WITH-FP-NEXT: addi a2, a6, %lo(var) +; RV64IZCMP-WITH-FP-NEXT: lw a0, 16(a2) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) +; RV64IZCMP-WITH-FP-NEXT: lw a0, 20(a2) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) +; RV64IZCMP-WITH-FP-NEXT: lw a0, 24(a2) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: addi a5, a6, %lo(var) -; RV64IZCMP-WITH-FP-NEXT: lw a0, 16(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a0, 28(a2) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, 20(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a0, 32(a2) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -152(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, 24(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a0, 36(a2) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -160(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw t5, 28(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t6, 32(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s2, 36(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s3, 40(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s4, 44(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s5, 48(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s6, 52(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s7, 56(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s8, 60(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s9, 64(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s10, 68(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s11, 72(a5) -; RV64IZCMP-WITH-FP-NEXT: lw ra, 76(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t4, 80(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t3, 84(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t2, 88(a5) -; RV64IZCMP-WITH-FP-NEXT: lw s1, 92(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t1, 96(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t0, 100(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a7, 104(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a4, 108(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a0, 124(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a1, 120(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a2, 116(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a3, 112(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a0, 124(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a1, 120(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a2, 116(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a3, 112(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a4, 108(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a7, 104(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t0, 100(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t1, 96(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s1, 92(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t2, 88(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t3, 84(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t4, 80(a5) -; RV64IZCMP-WITH-FP-NEXT: sw ra, 76(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s11, 72(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s10, 68(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s9, 64(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s8, 60(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s7, 56(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s6, 52(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s5, 48(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s4, 44(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s3, 40(a5) -; RV64IZCMP-WITH-FP-NEXT: sw s2, 36(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t6, 32(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t5, 28(a5) +; RV64IZCMP-WITH-FP-NEXT: lw t5, 40(a2) +; RV64IZCMP-WITH-FP-NEXT: lw t6, 44(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s2, 48(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s3, 
52(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s4, 56(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s5, 60(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s6, 64(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s7, 68(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s8, 72(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s9, 76(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s10, 80(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s11, 84(a2) +; RV64IZCMP-WITH-FP-NEXT: lw ra, 88(a2) +; RV64IZCMP-WITH-FP-NEXT: lw t1, 92(a2) +; RV64IZCMP-WITH-FP-NEXT: lw t0, 96(a2) +; RV64IZCMP-WITH-FP-NEXT: lw a7, 100(a2) +; RV64IZCMP-WITH-FP-NEXT: lw s1, 104(a2) +; RV64IZCMP-WITH-FP-NEXT: lw a5, 108(a2) +; RV64IZCMP-WITH-FP-NEXT: lw a4, 112(a2) +; RV64IZCMP-WITH-FP-NEXT: lw a3, 116(a2) +; RV64IZCMP-WITH-FP-NEXT: lw a1, 120(a2) +; RV64IZCMP-WITH-FP-NEXT: lw a0, 124(a2) +; RV64IZCMP-WITH-FP-NEXT: lw t4, %lo(var+4)(a6) +; RV64IZCMP-WITH-FP-NEXT: lw t3, %lo(var+8)(a6) +; RV64IZCMP-WITH-FP-NEXT: lw t2, %lo(var+12)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw a0, 124(a2) +; RV64IZCMP-WITH-FP-NEXT: sw a1, 120(a2) +; RV64IZCMP-WITH-FP-NEXT: sw a3, 116(a2) +; RV64IZCMP-WITH-FP-NEXT: sw a4, 112(a2) +; RV64IZCMP-WITH-FP-NEXT: sw a5, 108(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s1, 104(a2) +; RV64IZCMP-WITH-FP-NEXT: sw a7, 100(a2) +; RV64IZCMP-WITH-FP-NEXT: sw t0, 96(a2) +; RV64IZCMP-WITH-FP-NEXT: sw t1, 92(a2) +; RV64IZCMP-WITH-FP-NEXT: sw ra, 88(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s11, 84(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s10, 80(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s9, 76(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s8, 72(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s7, 68(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s6, 64(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s5, 60(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s4, 56(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s3, 52(a2) +; RV64IZCMP-WITH-FP-NEXT: sw s2, 48(a2) +; RV64IZCMP-WITH-FP-NEXT: sw t6, 44(a2) +; RV64IZCMP-WITH-FP-NEXT: sw t5, 40(a2) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -160(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, 24(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a0, 36(a2) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -152(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, 20(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a0, 32(a2) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, 16(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a0, 28(a2) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw a0, 24(a2) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw a0, 20(a2) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw a0, 16(a2) +; RV64IZCMP-WITH-FP-NEXT: sw t2, %lo(var+12)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw t3, %lo(var+8)(a6) +; RV64IZCMP-WITH-FP-NEXT: sw t4, %lo(var+4)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll index 649234efaad90..3407a99497fad 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll @@ -191,20 +191,20 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind { ; RV32I-FPELIM-LABEL: callee_large_scalars: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: 
lw a2, 0(a1) -; RV32I-FPELIM-NEXT: lw a3, 0(a0) -; RV32I-FPELIM-NEXT: lw a4, 4(a1) -; RV32I-FPELIM-NEXT: lw a5, 12(a1) -; RV32I-FPELIM-NEXT: lw a6, 12(a0) -; RV32I-FPELIM-NEXT: lw a7, 4(a0) -; RV32I-FPELIM-NEXT: lw a1, 8(a1) +; RV32I-FPELIM-NEXT: lw a3, 4(a1) +; RV32I-FPELIM-NEXT: lw a4, 8(a1) +; RV32I-FPELIM-NEXT: lw a1, 12(a1) +; RV32I-FPELIM-NEXT: lw a5, 12(a0) +; RV32I-FPELIM-NEXT: lw a6, 4(a0) +; RV32I-FPELIM-NEXT: lw a7, 0(a0) ; RV32I-FPELIM-NEXT: lw a0, 8(a0) -; RV32I-FPELIM-NEXT: xor a5, a6, a5 -; RV32I-FPELIM-NEXT: xor a4, a7, a4 -; RV32I-FPELIM-NEXT: or a4, a4, a5 -; RV32I-FPELIM-NEXT: xor a0, a0, a1 -; RV32I-FPELIM-NEXT: xor a2, a3, a2 +; RV32I-FPELIM-NEXT: xor a1, a5, a1 +; RV32I-FPELIM-NEXT: xor a3, a6, a3 +; RV32I-FPELIM-NEXT: or a1, a3, a1 +; RV32I-FPELIM-NEXT: xor a0, a0, a4 +; RV32I-FPELIM-NEXT: xor a2, a7, a2 ; RV32I-FPELIM-NEXT: or a0, a2, a0 -; RV32I-FPELIM-NEXT: or a0, a0, a4 +; RV32I-FPELIM-NEXT: or a0, a0, a1 ; RV32I-FPELIM-NEXT: seqz a0, a0 ; RV32I-FPELIM-NEXT: ret ; @@ -215,20 +215,20 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind { ; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 16 ; RV32I-WITHFP-NEXT: lw a2, 0(a1) -; RV32I-WITHFP-NEXT: lw a3, 0(a0) -; RV32I-WITHFP-NEXT: lw a4, 4(a1) -; RV32I-WITHFP-NEXT: lw a5, 12(a1) -; RV32I-WITHFP-NEXT: lw a6, 12(a0) -; RV32I-WITHFP-NEXT: lw a7, 4(a0) -; RV32I-WITHFP-NEXT: lw a1, 8(a1) +; RV32I-WITHFP-NEXT: lw a3, 4(a1) +; RV32I-WITHFP-NEXT: lw a4, 8(a1) +; RV32I-WITHFP-NEXT: lw a1, 12(a1) +; RV32I-WITHFP-NEXT: lw a5, 12(a0) +; RV32I-WITHFP-NEXT: lw a6, 4(a0) +; RV32I-WITHFP-NEXT: lw a7, 0(a0) ; RV32I-WITHFP-NEXT: lw a0, 8(a0) -; RV32I-WITHFP-NEXT: xor a5, a6, a5 -; RV32I-WITHFP-NEXT: xor a4, a7, a4 -; RV32I-WITHFP-NEXT: or a4, a4, a5 -; RV32I-WITHFP-NEXT: xor a0, a0, a1 -; RV32I-WITHFP-NEXT: xor a2, a3, a2 +; RV32I-WITHFP-NEXT: xor a1, a5, a1 +; RV32I-WITHFP-NEXT: xor a3, a6, a3 +; RV32I-WITHFP-NEXT: or a1, a3, a1 +; RV32I-WITHFP-NEXT: xor a0, a0, a4 +; RV32I-WITHFP-NEXT: xor a2, a7, a2 ; RV32I-WITHFP-NEXT: or a0, a2, a0 -; RV32I-WITHFP-NEXT: or a0, a0, a4 +; RV32I-WITHFP-NEXT: or a0, a0, a1 ; RV32I-WITHFP-NEXT: seqz a0, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -298,20 +298,20 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: lw a0, 4(sp) ; RV32I-FPELIM-NEXT: lw a1, 0(a0) -; RV32I-FPELIM-NEXT: lw a2, 0(a7) -; RV32I-FPELIM-NEXT: lw a3, 4(a0) -; RV32I-FPELIM-NEXT: lw a4, 12(a0) -; RV32I-FPELIM-NEXT: lw a5, 12(a7) -; RV32I-FPELIM-NEXT: lw a6, 4(a7) -; RV32I-FPELIM-NEXT: lw a0, 8(a0) +; RV32I-FPELIM-NEXT: lw a2, 4(a0) +; RV32I-FPELIM-NEXT: lw a3, 8(a0) +; RV32I-FPELIM-NEXT: lw a0, 12(a0) +; RV32I-FPELIM-NEXT: lw a4, 12(a7) +; RV32I-FPELIM-NEXT: lw a5, 4(a7) +; RV32I-FPELIM-NEXT: lw a6, 0(a7) ; RV32I-FPELIM-NEXT: lw a7, 8(a7) -; RV32I-FPELIM-NEXT: xor a4, a5, a4 -; RV32I-FPELIM-NEXT: xor a3, a6, a3 -; RV32I-FPELIM-NEXT: or a3, a3, a4 -; RV32I-FPELIM-NEXT: xor a0, a7, a0 -; RV32I-FPELIM-NEXT: xor a1, a2, a1 +; RV32I-FPELIM-NEXT: xor a0, a4, a0 +; RV32I-FPELIM-NEXT: xor a2, a5, a2 +; RV32I-FPELIM-NEXT: or a0, a2, a0 +; RV32I-FPELIM-NEXT: xor a2, a7, a3 +; RV32I-FPELIM-NEXT: xor a1, a6, a1 +; RV32I-FPELIM-NEXT: or a1, a1, a2 ; RV32I-FPELIM-NEXT: or a0, a1, a0 -; RV32I-FPELIM-NEXT: or a0, a0, a3 ; RV32I-FPELIM-NEXT: seqz a0, a0 ; RV32I-FPELIM-NEXT: ret ; @@ -323,20 +323,20 @@ define i32 
@callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; RV32I-WITHFP-NEXT: addi s0, sp, 16 ; RV32I-WITHFP-NEXT: lw a0, 4(s0) ; RV32I-WITHFP-NEXT: lw a1, 0(a0) -; RV32I-WITHFP-NEXT: lw a2, 0(a7) -; RV32I-WITHFP-NEXT: lw a3, 4(a0) -; RV32I-WITHFP-NEXT: lw a4, 12(a0) -; RV32I-WITHFP-NEXT: lw a5, 12(a7) -; RV32I-WITHFP-NEXT: lw a6, 4(a7) -; RV32I-WITHFP-NEXT: lw a0, 8(a0) +; RV32I-WITHFP-NEXT: lw a2, 4(a0) +; RV32I-WITHFP-NEXT: lw a3, 8(a0) +; RV32I-WITHFP-NEXT: lw a0, 12(a0) +; RV32I-WITHFP-NEXT: lw a4, 12(a7) +; RV32I-WITHFP-NEXT: lw a5, 4(a7) +; RV32I-WITHFP-NEXT: lw a6, 0(a7) ; RV32I-WITHFP-NEXT: lw a7, 8(a7) -; RV32I-WITHFP-NEXT: xor a4, a5, a4 -; RV32I-WITHFP-NEXT: xor a3, a6, a3 -; RV32I-WITHFP-NEXT: or a3, a3, a4 -; RV32I-WITHFP-NEXT: xor a0, a7, a0 -; RV32I-WITHFP-NEXT: xor a1, a2, a1 +; RV32I-WITHFP-NEXT: xor a0, a4, a0 +; RV32I-WITHFP-NEXT: xor a2, a5, a2 +; RV32I-WITHFP-NEXT: or a0, a2, a0 +; RV32I-WITHFP-NEXT: xor a2, a7, a3 +; RV32I-WITHFP-NEXT: xor a1, a6, a1 +; RV32I-WITHFP-NEXT: or a1, a1, a2 ; RV32I-WITHFP-NEXT: or a0, a1, a0 -; RV32I-WITHFP-NEXT: or a0, a0, a3 ; RV32I-WITHFP-NEXT: seqz a0, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll index c2690d15665e2..f99bb598f4834 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll @@ -107,20 +107,20 @@ define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind { ; RV64I-LABEL: callee_large_scalars: ; RV64I: # %bb.0: ; RV64I-NEXT: ld a2, 0(a1) -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: ld a4, 8(a1) -; RV64I-NEXT: ld a5, 24(a1) -; RV64I-NEXT: ld a6, 24(a0) -; RV64I-NEXT: ld a7, 8(a0) -; RV64I-NEXT: ld a1, 16(a1) +; RV64I-NEXT: ld a3, 8(a1) +; RV64I-NEXT: ld a4, 16(a1) +; RV64I-NEXT: ld a1, 24(a1) +; RV64I-NEXT: ld a5, 24(a0) +; RV64I-NEXT: ld a6, 8(a0) +; RV64I-NEXT: ld a7, 0(a0) ; RV64I-NEXT: ld a0, 16(a0) -; RV64I-NEXT: xor a5, a6, a5 -; RV64I-NEXT: xor a4, a7, a4 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: xor a2, a3, a2 +; RV64I-NEXT: xor a1, a5, a1 +; RV64I-NEXT: xor a3, a6, a3 +; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: xor a0, a0, a4 +; RV64I-NEXT: xor a2, a7, a2 ; RV64I-NEXT: or a0, a2, a0 -; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = icmp eq i256 %a, %b @@ -162,20 +162,20 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d, ; RV64I: # %bb.0: ; RV64I-NEXT: ld a0, 8(sp) ; RV64I-NEXT: ld a1, 0(a0) -; RV64I-NEXT: ld a2, 0(a7) -; RV64I-NEXT: ld a3, 8(a0) -; RV64I-NEXT: ld a4, 24(a0) -; RV64I-NEXT: ld a5, 24(a7) -; RV64I-NEXT: ld a6, 8(a7) -; RV64I-NEXT: ld a0, 16(a0) +; RV64I-NEXT: ld a2, 8(a0) +; RV64I-NEXT: ld a3, 16(a0) +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: ld a4, 24(a7) +; RV64I-NEXT: ld a5, 8(a7) +; RV64I-NEXT: ld a6, 0(a7) ; RV64I-NEXT: ld a7, 16(a7) -; RV64I-NEXT: xor a4, a5, a4 -; RV64I-NEXT: xor a3, a6, a3 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: xor a0, a7, a0 -; RV64I-NEXT: xor a1, a2, a1 +; RV64I-NEXT: xor a0, a4, a0 +; RV64I-NEXT: xor a2, a5, a2 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: xor a2, a7, a3 +; RV64I-NEXT: xor a1, a6, a1 +; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = icmp eq 
i256 %h, %j diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll index f2079e314d51c..bfa14b52f18e9 100644 --- a/llvm/test/CodeGen/RISCV/forced-atomics.ll +++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll @@ -3348,8 +3348,8 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: lw a4, 0(a0) +; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: j .LBB49_2 ; RV32-NEXT: .LBB49_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB49_2 Depth=1 @@ -3362,8 +3362,8 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind { ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: bnez a0, .LBB49_6 ; RV32-NEXT: .LBB49_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3453,8 +3453,8 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: lw a4, 0(a0) +; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: j .LBB50_2 ; RV32-NEXT: .LBB50_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB50_2 Depth=1 @@ -3467,8 +3467,8 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind { ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: bnez a0, .LBB50_6 ; RV32-NEXT: .LBB50_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3560,8 +3560,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: lw a4, 0(a0) +; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: j .LBB51_2 ; RV32-NEXT: .LBB51_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB51_2 Depth=1 @@ -3574,8 +3574,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind { ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: bnez a0, .LBB51_4 ; RV32-NEXT: .LBB51_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3652,8 +3652,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: lw a4, 0(a0) +; RV32-NEXT: lw a1, 4(a0) ; RV32-NEXT: j .LBB52_2 ; RV32-NEXT: .LBB52_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB52_2 Depth=1 @@ -3666,8 +3666,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind { ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: bnez a0, .LBB52_4 ; RV32-NEXT: .LBB52_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3802,30 +3802,30 @@ define double @rmw64_fadd_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw s1, 4(a0) -; RV32-NEXT: lw s2, 0(a0) +; RV32-NEXT: lw s1, 0(a0) +; RV32-NEXT: lw 
s2, 4(a0) ; RV32-NEXT: .LBB54_1: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: lui a3, 261888 -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: mv a1, s1 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s2 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: call __adddf3@plt ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: mv a3, a1 -; RV32-NEXT: sw s2, 8(sp) -; RV32-NEXT: sw s1, 12(sp) +; RV32-NEXT: sw s1, 8(sp) +; RV32-NEXT: sw s2, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw s1, 12(sp) -; RV32-NEXT: lw s2, 8(sp) +; RV32-NEXT: lw s1, 8(sp) +; RV32-NEXT: lw s2, 12(sp) ; RV32-NEXT: beqz a0, .LBB54_1 ; RV32-NEXT: # %bb.2: # %atomicrmw.end -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: mv a1, s1 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3937,30 +3937,30 @@ define double @rmw64_fsub_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw s1, 4(a0) -; RV32-NEXT: lw s2, 0(a0) +; RV32-NEXT: lw s1, 0(a0) +; RV32-NEXT: lw s2, 4(a0) ; RV32-NEXT: .LBB55_1: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: lui a3, 786176 -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: mv a1, s1 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s2 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: call __adddf3@plt ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: mv a3, a1 -; RV32-NEXT: sw s2, 8(sp) -; RV32-NEXT: sw s1, 12(sp) +; RV32-NEXT: sw s1, 8(sp) +; RV32-NEXT: sw s2, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw s1, 12(sp) -; RV32-NEXT: lw s2, 8(sp) +; RV32-NEXT: lw s1, 8(sp) +; RV32-NEXT: lw s2, 12(sp) ; RV32-NEXT: beqz a0, .LBB55_1 ; RV32-NEXT: # %bb.2: # %atomicrmw.end -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: mv a1, s1 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -4072,30 +4072,30 @@ define double @rmw64_fmin_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw s1, 4(a0) -; RV32-NEXT: lw s2, 0(a0) +; RV32-NEXT: lw s1, 0(a0) +; RV32-NEXT: lw s2, 4(a0) ; RV32-NEXT: .LBB56_1: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: lui a3, 261888 -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: mv a1, s1 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s2 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: call fmin@plt ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: mv a3, a1 -; RV32-NEXT: sw s2, 8(sp) -; RV32-NEXT: sw s1, 12(sp) +; RV32-NEXT: sw s1, 8(sp) +; RV32-NEXT: sw s2, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw s1, 12(sp) -; RV32-NEXT: lw s2, 8(sp) +; RV32-NEXT: lw s1, 8(sp) +; RV32-NEXT: lw s2, 12(sp) ; RV32-NEXT: beqz a0, .LBB56_1 ; RV32-NEXT: # %bb.2: # %atomicrmw.end -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: mv a1, s1 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; 
RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -4207,30 +4207,30 @@ define double @rmw64_fmax_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw s1, 4(a0) -; RV32-NEXT: lw s2, 0(a0) +; RV32-NEXT: lw s1, 0(a0) +; RV32-NEXT: lw s2, 4(a0) ; RV32-NEXT: .LBB57_1: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: lui a3, 261888 -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: mv a1, s1 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s2 ; RV32-NEXT: li a2, 0 ; RV32-NEXT: call fmax@plt ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: mv a3, a1 -; RV32-NEXT: sw s2, 8(sp) -; RV32-NEXT: sw s1, 12(sp) +; RV32-NEXT: sw s1, 8(sp) +; RV32-NEXT: sw s2, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw s1, 12(sp) -; RV32-NEXT: lw s2, 8(sp) +; RV32-NEXT: lw s1, 8(sp) +; RV32-NEXT: lw s2, 12(sp) ; RV32-NEXT: beqz a0, .LBB57_1 ; RV32-NEXT: # %bb.2: # %atomicrmw.end -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: mv a1, s1 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -4346,8 +4346,8 @@ define i64 @cmpxchg64_monotonic(ptr %p) nounwind { ; RV32-NEXT: li a4, 0 ; RV32-NEXT: li a5, 0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: lw a0, 0(sp) +; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -4406,8 +4406,8 @@ define i64 @cmpxchg64_seq_cst(ptr %p) nounwind { ; RV32-NEXT: li a5, 5 ; RV32-NEXT: li a3, 0 ; RV32-NEXT: call __atomic_compare_exchange_8@plt -; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: lw a0, 0(sp) +; RV32-NEXT: lw a1, 4(sp) ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -4531,25 +4531,25 @@ define i128 @rmw128(ptr %p) nounwind { ; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a1 -; RV32-NEXT: lw a1, 12(a1) -; RV32-NEXT: lw a2, 8(s0) -; RV32-NEXT: lw a3, 4(s0) -; RV32-NEXT: lw a4, 0(s0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 4(s0) +; RV32-NEXT: lw a3, 8(s0) +; RV32-NEXT: lw a4, 12(s0) ; RV32-NEXT: mv s1, a0 ; RV32-NEXT: .LBB62_1: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: addi a0, a4, 1 +; RV32-NEXT: addi a0, a1, 1 ; RV32-NEXT: seqz a5, a0 -; RV32-NEXT: add a5, a3, a5 +; RV32-NEXT: add a5, a2, a5 ; RV32-NEXT: or a6, a0, a5 ; RV32-NEXT: seqz a6, a6 -; RV32-NEXT: add a6, a2, a6 -; RV32-NEXT: sltu a7, a6, a2 -; RV32-NEXT: add a7, a1, a7 -; RV32-NEXT: sw a4, 16(sp) -; RV32-NEXT: sw a3, 20(sp) -; RV32-NEXT: sw a2, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: add a6, a3, a6 +; RV32-NEXT: sltu a7, a6, a3 +; RV32-NEXT: add a7, a4, a7 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: sw a3, 24(sp) +; RV32-NEXT: sw a4, 28(sp) ; RV32-NEXT: sw a5, 4(sp) ; RV32-NEXT: sw a0, 0(sp) ; RV32-NEXT: sw a6, 8(sp) @@ -4561,16 +4561,16 @@ define i128 @rmw128(ptr %p) nounwind { ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a1, s0 ; RV32-NEXT: call __atomic_compare_exchange@plt -; RV32-NEXT: lw a1, 28(sp) -; RV32-NEXT: lw a2, 24(sp) -; RV32-NEXT: lw a3, 20(sp) -; RV32-NEXT: lw a4, 16(sp) +; RV32-NEXT: lw a1, 16(sp) +; RV32-NEXT: lw a2, 20(sp) +; RV32-NEXT: 
lw a3, 24(sp) +; RV32-NEXT: lw a4, 28(sp) ; RV32-NEXT: beqz a0, .LBB62_1 ; RV32-NEXT: # %bb.2: # %atomicrmw.end -; RV32-NEXT: sw a4, 0(s1) -; RV32-NEXT: sw a3, 4(s1) -; RV32-NEXT: sw a2, 8(s1) -; RV32-NEXT: sw a1, 12(s1) +; RV32-NEXT: sw a1, 0(s1) +; RV32-NEXT: sw a2, 4(s1) +; RV32-NEXT: sw a3, 8(s1) +; RV32-NEXT: sw a4, 12(s1) ; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -4639,8 +4639,8 @@ define i128 @cmpxchg128(ptr %p) nounwind { ; RV64-NEXT: li a5, 5 ; RV64-NEXT: li a3, 0 ; RV64-NEXT: call __atomic_compare_exchange_16@plt -; RV64-NEXT: ld a1, 8(sp) ; RV64-NEXT: ld a0, 0(sp) +; RV64-NEXT: ld a1, 8(sp) ; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 32 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index b091b0613c0f3..ae42cd6f8128b 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -1043,24 +1043,24 @@ define i64 @stest_f64i64(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti@plt -; RV32IF-NEXT: lw a0, 20(sp) -; RV32IF-NEXT: lw a2, 16(sp) +; RV32IF-NEXT: lw a2, 8(sp) ; RV32IF-NEXT: lw a1, 12(sp) -; RV32IF-NEXT: lw a4, 8(sp) -; RV32IF-NEXT: lui a3, 524288 -; RV32IF-NEXT: addi a5, a3, -1 +; RV32IF-NEXT: lw a3, 16(sp) +; RV32IF-NEXT: lw a4, 20(sp) +; RV32IF-NEXT: lui a0, 524288 +; RV32IF-NEXT: addi a5, a0, -1 ; RV32IF-NEXT: beq a1, a5, .LBB18_2 ; RV32IF-NEXT: # %bb.1: # %entry ; RV32IF-NEXT: sltu a6, a1, a5 -; RV32IF-NEXT: or a7, a2, a0 +; RV32IF-NEXT: or a7, a3, a4 ; RV32IF-NEXT: bnez a7, .LBB18_3 ; RV32IF-NEXT: j .LBB18_4 ; RV32IF-NEXT: .LBB18_2: -; RV32IF-NEXT: sltiu a6, a4, -1 -; RV32IF-NEXT: or a7, a2, a0 +; RV32IF-NEXT: sltiu a6, a2, -1 +; RV32IF-NEXT: or a7, a3, a4 ; RV32IF-NEXT: beqz a7, .LBB18_4 ; RV32IF-NEXT: .LBB18_3: # %entry -; RV32IF-NEXT: slti a6, a0, 0 +; RV32IF-NEXT: slti a6, a4, 0 ; RV32IF-NEXT: .LBB18_4: # %entry ; RV32IF-NEXT: neg a7, a6 ; RV32IF-NEXT: addi t0, a6, -1 @@ -1068,21 +1068,21 @@ define i64 @stest_f64i64(double %x) { ; RV32IF-NEXT: # %bb.5: # %entry ; RV32IF-NEXT: mv a1, a5 ; RV32IF-NEXT: .LBB18_6: # %entry -; RV32IF-NEXT: or a4, t0, a4 -; RV32IF-NEXT: and a5, a7, a0 -; RV32IF-NEXT: and a2, a7, a2 -; RV32IF-NEXT: beq a1, a3, .LBB18_8 +; RV32IF-NEXT: or a2, t0, a2 +; RV32IF-NEXT: and a4, a7, a4 +; RV32IF-NEXT: and a3, a7, a3 +; RV32IF-NEXT: beq a1, a0, .LBB18_8 ; RV32IF-NEXT: # %bb.7: # %entry -; RV32IF-NEXT: sltu a0, a3, a1 +; RV32IF-NEXT: sltu a0, a0, a1 ; RV32IF-NEXT: j .LBB18_9 ; RV32IF-NEXT: .LBB18_8: -; RV32IF-NEXT: snez a0, a4 +; RV32IF-NEXT: snez a0, a2 ; RV32IF-NEXT: .LBB18_9: # %entry -; RV32IF-NEXT: and a2, a2, a5 -; RV32IF-NEXT: li a3, -1 -; RV32IF-NEXT: beq a2, a3, .LBB18_11 +; RV32IF-NEXT: and a3, a3, a4 +; RV32IF-NEXT: li a5, -1 +; RV32IF-NEXT: beq a3, a5, .LBB18_11 ; RV32IF-NEXT: # %bb.10: # %entry -; RV32IF-NEXT: slti a0, a5, 0 +; RV32IF-NEXT: slti a0, a4, 0 ; RV32IF-NEXT: xori a0, a0, 1 ; RV32IF-NEXT: .LBB18_11: # %entry ; RV32IF-NEXT: bnez a0, .LBB18_13 @@ -1090,7 +1090,7 @@ define i64 @stest_f64i64(double %x) { ; RV32IF-NEXT: lui a1, 524288 ; RV32IF-NEXT: .LBB18_13: # %entry ; RV32IF-NEXT: neg a0, a0 -; RV32IF-NEXT: and a0, a0, a4 +; RV32IF-NEXT: and a0, a0, a2 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 ; RV32IF-NEXT: ret @@ -1142,24 +1142,24 @@ define i64 @stest_f64i64(double %x) { ; RV32IFD-NEXT: 
.cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti@plt -; RV32IFD-NEXT: lw a0, 20(sp) -; RV32IFD-NEXT: lw a2, 16(sp) +; RV32IFD-NEXT: lw a2, 8(sp) ; RV32IFD-NEXT: lw a1, 12(sp) -; RV32IFD-NEXT: lw a4, 8(sp) -; RV32IFD-NEXT: lui a3, 524288 -; RV32IFD-NEXT: addi a5, a3, -1 +; RV32IFD-NEXT: lw a3, 16(sp) +; RV32IFD-NEXT: lw a4, 20(sp) +; RV32IFD-NEXT: lui a0, 524288 +; RV32IFD-NEXT: addi a5, a0, -1 ; RV32IFD-NEXT: beq a1, a5, .LBB18_2 ; RV32IFD-NEXT: # %bb.1: # %entry ; RV32IFD-NEXT: sltu a6, a1, a5 -; RV32IFD-NEXT: or a7, a2, a0 +; RV32IFD-NEXT: or a7, a3, a4 ; RV32IFD-NEXT: bnez a7, .LBB18_3 ; RV32IFD-NEXT: j .LBB18_4 ; RV32IFD-NEXT: .LBB18_2: -; RV32IFD-NEXT: sltiu a6, a4, -1 -; RV32IFD-NEXT: or a7, a2, a0 +; RV32IFD-NEXT: sltiu a6, a2, -1 +; RV32IFD-NEXT: or a7, a3, a4 ; RV32IFD-NEXT: beqz a7, .LBB18_4 ; RV32IFD-NEXT: .LBB18_3: # %entry -; RV32IFD-NEXT: slti a6, a0, 0 +; RV32IFD-NEXT: slti a6, a4, 0 ; RV32IFD-NEXT: .LBB18_4: # %entry ; RV32IFD-NEXT: neg a7, a6 ; RV32IFD-NEXT: addi t0, a6, -1 @@ -1167,21 +1167,21 @@ define i64 @stest_f64i64(double %x) { ; RV32IFD-NEXT: # %bb.5: # %entry ; RV32IFD-NEXT: mv a1, a5 ; RV32IFD-NEXT: .LBB18_6: # %entry -; RV32IFD-NEXT: or a4, t0, a4 -; RV32IFD-NEXT: and a5, a7, a0 -; RV32IFD-NEXT: and a2, a7, a2 -; RV32IFD-NEXT: beq a1, a3, .LBB18_8 +; RV32IFD-NEXT: or a2, t0, a2 +; RV32IFD-NEXT: and a4, a7, a4 +; RV32IFD-NEXT: and a3, a7, a3 +; RV32IFD-NEXT: beq a1, a0, .LBB18_8 ; RV32IFD-NEXT: # %bb.7: # %entry -; RV32IFD-NEXT: sltu a0, a3, a1 +; RV32IFD-NEXT: sltu a0, a0, a1 ; RV32IFD-NEXT: j .LBB18_9 ; RV32IFD-NEXT: .LBB18_8: -; RV32IFD-NEXT: snez a0, a4 +; RV32IFD-NEXT: snez a0, a2 ; RV32IFD-NEXT: .LBB18_9: # %entry -; RV32IFD-NEXT: and a2, a2, a5 -; RV32IFD-NEXT: li a3, -1 -; RV32IFD-NEXT: beq a2, a3, .LBB18_11 +; RV32IFD-NEXT: and a3, a3, a4 +; RV32IFD-NEXT: li a5, -1 +; RV32IFD-NEXT: beq a3, a5, .LBB18_11 ; RV32IFD-NEXT: # %bb.10: # %entry -; RV32IFD-NEXT: slti a0, a5, 0 +; RV32IFD-NEXT: slti a0, a4, 0 ; RV32IFD-NEXT: xori a0, a0, 1 ; RV32IFD-NEXT: .LBB18_11: # %entry ; RV32IFD-NEXT: bnez a0, .LBB18_13 @@ -1189,7 +1189,7 @@ define i64 @stest_f64i64(double %x) { ; RV32IFD-NEXT: lui a1, 524288 ; RV32IFD-NEXT: .LBB18_13: # %entry ; RV32IFD-NEXT: neg a0, a0 -; RV32IFD-NEXT: and a0, a0, a4 +; RV32IFD-NEXT: and a0, a0, a2 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 ; RV32IFD-NEXT: ret @@ -1225,18 +1225,18 @@ define i64 @utest_f64i64(double %x) { ; RV32IF-NEXT: call __fixunsdfti@plt ; RV32IF-NEXT: lw a0, 16(sp) ; RV32IF-NEXT: lw a1, 20(sp) -; RV32IF-NEXT: lw a2, 12(sp) -; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: or a4, a1, a0 +; RV32IF-NEXT: lw a2, 8(sp) +; RV32IF-NEXT: lw a3, 12(sp) +; RV32IF-NEXT: xori a4, a0, 1 +; RV32IF-NEXT: or a4, a4, a1 ; RV32IF-NEXT: seqz a4, a4 -; RV32IF-NEXT: xori a0, a0, 1 -; RV32IF-NEXT: or a0, a0, a1 +; RV32IF-NEXT: addi a4, a4, -1 +; RV32IF-NEXT: or a0, a1, a0 ; RV32IF-NEXT: seqz a0, a0 -; RV32IF-NEXT: addi a0, a0, -1 -; RV32IF-NEXT: and a0, a0, a4 +; RV32IF-NEXT: and a0, a4, a0 ; RV32IF-NEXT: neg a1, a0 -; RV32IF-NEXT: and a0, a1, a3 -; RV32IF-NEXT: and a1, a1, a2 +; RV32IF-NEXT: and a0, a1, a2 +; RV32IF-NEXT: and a1, a1, a3 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 ; RV32IF-NEXT: ret @@ -1265,18 +1265,18 @@ define i64 @utest_f64i64(double %x) { ; RV32IFD-NEXT: call __fixunsdfti@plt ; RV32IFD-NEXT: lw a0, 16(sp) ; RV32IFD-NEXT: lw a1, 20(sp) -; RV32IFD-NEXT: lw a2, 12(sp) -; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: 
or a4, a1, a0 +; RV32IFD-NEXT: lw a2, 8(sp) +; RV32IFD-NEXT: lw a3, 12(sp) +; RV32IFD-NEXT: xori a4, a0, 1 +; RV32IFD-NEXT: or a4, a4, a1 ; RV32IFD-NEXT: seqz a4, a4 -; RV32IFD-NEXT: xori a0, a0, 1 -; RV32IFD-NEXT: or a0, a0, a1 +; RV32IFD-NEXT: addi a4, a4, -1 +; RV32IFD-NEXT: or a0, a1, a0 ; RV32IFD-NEXT: seqz a0, a0 -; RV32IFD-NEXT: addi a0, a0, -1 -; RV32IFD-NEXT: and a0, a0, a4 +; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: neg a1, a0 -; RV32IFD-NEXT: and a0, a1, a3 -; RV32IFD-NEXT: and a1, a1, a2 +; RV32IFD-NEXT: and a0, a1, a2 +; RV32IFD-NEXT: and a1, a1, a3 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 ; RV32IFD-NEXT: ret @@ -1440,24 +1440,24 @@ define i64 @stest_f32i64(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti@plt -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a2, 8(sp) ; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lui a3, 524288 -; RV32-NEXT: addi a5, a3, -1 +; RV32-NEXT: lw a3, 16(sp) +; RV32-NEXT: lw a4, 20(sp) +; RV32-NEXT: lui a0, 524288 +; RV32-NEXT: addi a5, a0, -1 ; RV32-NEXT: beq a1, a5, .LBB21_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a6, a1, a5 -; RV32-NEXT: or a7, a2, a0 +; RV32-NEXT: or a7, a3, a4 ; RV32-NEXT: bnez a7, .LBB21_3 ; RV32-NEXT: j .LBB21_4 ; RV32-NEXT: .LBB21_2: -; RV32-NEXT: sltiu a6, a4, -1 -; RV32-NEXT: or a7, a2, a0 +; RV32-NEXT: sltiu a6, a2, -1 +; RV32-NEXT: or a7, a3, a4 ; RV32-NEXT: beqz a7, .LBB21_4 ; RV32-NEXT: .LBB21_3: # %entry -; RV32-NEXT: slti a6, a0, 0 +; RV32-NEXT: slti a6, a4, 0 ; RV32-NEXT: .LBB21_4: # %entry ; RV32-NEXT: neg a7, a6 ; RV32-NEXT: addi t0, a6, -1 @@ -1465,21 +1465,21 @@ define i64 @stest_f32i64(float %x) { ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB21_6: # %entry -; RV32-NEXT: or a4, t0, a4 -; RV32-NEXT: and a5, a7, a0 -; RV32-NEXT: and a2, a7, a2 -; RV32-NEXT: beq a1, a3, .LBB21_8 +; RV32-NEXT: or a2, t0, a2 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a3, a7, a3 +; RV32-NEXT: beq a1, a0, .LBB21_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: j .LBB21_9 ; RV32-NEXT: .LBB21_8: -; RV32-NEXT: snez a0, a4 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: .LBB21_9: # %entry -; RV32-NEXT: and a2, a2, a5 -; RV32-NEXT: li a3, -1 -; RV32-NEXT: beq a2, a3, .LBB21_11 +; RV32-NEXT: and a3, a3, a4 +; RV32-NEXT: li a5, -1 +; RV32-NEXT: beq a3, a5, .LBB21_11 ; RV32-NEXT: # %bb.10: # %entry -; RV32-NEXT: slti a0, a5, 0 +; RV32-NEXT: slti a0, a4, 0 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: .LBB21_11: # %entry ; RV32-NEXT: bnez a0, .LBB21_13 @@ -1487,7 +1487,7 @@ define i64 @stest_f32i64(float %x) { ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB21_13: # %entry ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -1521,18 +1521,18 @@ define i64 @utest_f32i64(float %x) { ; RV32-NEXT: call __fixunssfti@plt ; RV32-NEXT: lw a0, 16(sp) ; RV32-NEXT: lw a1, 20(sp) -; RV32-NEXT: lw a2, 12(sp) -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 +; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: xori a4, a0, 1 +; RV32-NEXT: or a4, a4, a1 ; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: xori a0, a0, 1 -; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a0, 
a4, a0 ; RV32-NEXT: neg a1, a0 -; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: and a0, a1, a2 +; RV32-NEXT: and a1, a1, a3 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -1657,24 +1657,24 @@ define i64 @stest_f16i64(half %x) { ; RV32-NEXT: call __extendhfsf2@plt ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti@plt -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a2, 8(sp) ; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lui a3, 524288 -; RV32-NEXT: addi a5, a3, -1 +; RV32-NEXT: lw a3, 16(sp) +; RV32-NEXT: lw a4, 20(sp) +; RV32-NEXT: lui a0, 524288 +; RV32-NEXT: addi a5, a0, -1 ; RV32-NEXT: beq a1, a5, .LBB24_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a6, a1, a5 -; RV32-NEXT: or a7, a2, a0 +; RV32-NEXT: or a7, a3, a4 ; RV32-NEXT: bnez a7, .LBB24_3 ; RV32-NEXT: j .LBB24_4 ; RV32-NEXT: .LBB24_2: -; RV32-NEXT: sltiu a6, a4, -1 -; RV32-NEXT: or a7, a2, a0 +; RV32-NEXT: sltiu a6, a2, -1 +; RV32-NEXT: or a7, a3, a4 ; RV32-NEXT: beqz a7, .LBB24_4 ; RV32-NEXT: .LBB24_3: # %entry -; RV32-NEXT: slti a6, a0, 0 +; RV32-NEXT: slti a6, a4, 0 ; RV32-NEXT: .LBB24_4: # %entry ; RV32-NEXT: neg a7, a6 ; RV32-NEXT: addi t0, a6, -1 @@ -1682,21 +1682,21 @@ define i64 @stest_f16i64(half %x) { ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB24_6: # %entry -; RV32-NEXT: or a4, t0, a4 -; RV32-NEXT: and a5, a7, a0 -; RV32-NEXT: and a2, a7, a2 -; RV32-NEXT: beq a1, a3, .LBB24_8 +; RV32-NEXT: or a2, t0, a2 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a3, a7, a3 +; RV32-NEXT: beq a1, a0, .LBB24_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: j .LBB24_9 ; RV32-NEXT: .LBB24_8: -; RV32-NEXT: snez a0, a4 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: .LBB24_9: # %entry -; RV32-NEXT: and a2, a2, a5 -; RV32-NEXT: li a3, -1 -; RV32-NEXT: beq a2, a3, .LBB24_11 +; RV32-NEXT: and a3, a3, a4 +; RV32-NEXT: li a5, -1 +; RV32-NEXT: beq a3, a5, .LBB24_11 ; RV32-NEXT: # %bb.10: # %entry -; RV32-NEXT: slti a0, a5, 0 +; RV32-NEXT: slti a0, a4, 0 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: .LBB24_11: # %entry ; RV32-NEXT: bnez a0, .LBB24_13 @@ -1704,7 +1704,7 @@ define i64 @stest_f16i64(half %x) { ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB24_13: # %entry ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -1770,18 +1770,18 @@ define i64 @utesth_f16i64(half %x) { ; RV32-NEXT: call __fixunssfti@plt ; RV32-NEXT: lw a0, 16(sp) ; RV32-NEXT: lw a1, 20(sp) -; RV32-NEXT: lw a2, 12(sp) -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 +; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: xori a4, a0, 1 +; RV32-NEXT: or a4, a4, a1 ; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: xori a0, a0, 1 -; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a0, a4, a0 ; RV32-NEXT: neg a1, a0 -; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: and a0, a1, a2 +; RV32-NEXT: and a1, a1, a3 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -2892,24 +2892,24 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti@plt -; RV32IF-NEXT: lw a0, 20(sp) -; 
RV32IF-NEXT: lw a2, 16(sp) +; RV32IF-NEXT: lw a2, 8(sp) ; RV32IF-NEXT: lw a1, 12(sp) -; RV32IF-NEXT: lw a4, 8(sp) -; RV32IF-NEXT: lui a3, 524288 -; RV32IF-NEXT: addi a5, a3, -1 +; RV32IF-NEXT: lw a3, 16(sp) +; RV32IF-NEXT: lw a4, 20(sp) +; RV32IF-NEXT: lui a0, 524288 +; RV32IF-NEXT: addi a5, a0, -1 ; RV32IF-NEXT: beq a1, a5, .LBB45_2 ; RV32IF-NEXT: # %bb.1: # %entry ; RV32IF-NEXT: sltu a6, a1, a5 -; RV32IF-NEXT: or a7, a2, a0 +; RV32IF-NEXT: or a7, a3, a4 ; RV32IF-NEXT: bnez a7, .LBB45_3 ; RV32IF-NEXT: j .LBB45_4 ; RV32IF-NEXT: .LBB45_2: -; RV32IF-NEXT: sltiu a6, a4, -1 -; RV32IF-NEXT: or a7, a2, a0 +; RV32IF-NEXT: sltiu a6, a2, -1 +; RV32IF-NEXT: or a7, a3, a4 ; RV32IF-NEXT: beqz a7, .LBB45_4 ; RV32IF-NEXT: .LBB45_3: # %entry -; RV32IF-NEXT: slti a6, a0, 0 +; RV32IF-NEXT: slti a6, a4, 0 ; RV32IF-NEXT: .LBB45_4: # %entry ; RV32IF-NEXT: neg a7, a6 ; RV32IF-NEXT: addi t0, a6, -1 @@ -2917,21 +2917,21 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IF-NEXT: # %bb.5: # %entry ; RV32IF-NEXT: mv a1, a5 ; RV32IF-NEXT: .LBB45_6: # %entry -; RV32IF-NEXT: or a4, t0, a4 -; RV32IF-NEXT: and a5, a7, a0 -; RV32IF-NEXT: and a2, a7, a2 -; RV32IF-NEXT: beq a1, a3, .LBB45_8 +; RV32IF-NEXT: or a2, t0, a2 +; RV32IF-NEXT: and a4, a7, a4 +; RV32IF-NEXT: and a3, a7, a3 +; RV32IF-NEXT: beq a1, a0, .LBB45_8 ; RV32IF-NEXT: # %bb.7: # %entry -; RV32IF-NEXT: sltu a0, a3, a1 +; RV32IF-NEXT: sltu a0, a0, a1 ; RV32IF-NEXT: j .LBB45_9 ; RV32IF-NEXT: .LBB45_8: -; RV32IF-NEXT: snez a0, a4 +; RV32IF-NEXT: snez a0, a2 ; RV32IF-NEXT: .LBB45_9: # %entry -; RV32IF-NEXT: and a2, a2, a5 -; RV32IF-NEXT: li a3, -1 -; RV32IF-NEXT: beq a2, a3, .LBB45_11 +; RV32IF-NEXT: and a3, a3, a4 +; RV32IF-NEXT: li a5, -1 +; RV32IF-NEXT: beq a3, a5, .LBB45_11 ; RV32IF-NEXT: # %bb.10: # %entry -; RV32IF-NEXT: slti a0, a5, 0 +; RV32IF-NEXT: slti a0, a4, 0 ; RV32IF-NEXT: xori a0, a0, 1 ; RV32IF-NEXT: .LBB45_11: # %entry ; RV32IF-NEXT: bnez a0, .LBB45_13 @@ -2939,7 +2939,7 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IF-NEXT: lui a1, 524288 ; RV32IF-NEXT: .LBB45_13: # %entry ; RV32IF-NEXT: neg a0, a0 -; RV32IF-NEXT: and a0, a0, a4 +; RV32IF-NEXT: and a0, a0, a2 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 ; RV32IF-NEXT: ret @@ -2991,24 +2991,24 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti@plt -; RV32IFD-NEXT: lw a0, 20(sp) -; RV32IFD-NEXT: lw a2, 16(sp) +; RV32IFD-NEXT: lw a2, 8(sp) ; RV32IFD-NEXT: lw a1, 12(sp) -; RV32IFD-NEXT: lw a4, 8(sp) -; RV32IFD-NEXT: lui a3, 524288 -; RV32IFD-NEXT: addi a5, a3, -1 +; RV32IFD-NEXT: lw a3, 16(sp) +; RV32IFD-NEXT: lw a4, 20(sp) +; RV32IFD-NEXT: lui a0, 524288 +; RV32IFD-NEXT: addi a5, a0, -1 ; RV32IFD-NEXT: beq a1, a5, .LBB45_2 ; RV32IFD-NEXT: # %bb.1: # %entry ; RV32IFD-NEXT: sltu a6, a1, a5 -; RV32IFD-NEXT: or a7, a2, a0 +; RV32IFD-NEXT: or a7, a3, a4 ; RV32IFD-NEXT: bnez a7, .LBB45_3 ; RV32IFD-NEXT: j .LBB45_4 ; RV32IFD-NEXT: .LBB45_2: -; RV32IFD-NEXT: sltiu a6, a4, -1 -; RV32IFD-NEXT: or a7, a2, a0 +; RV32IFD-NEXT: sltiu a6, a2, -1 +; RV32IFD-NEXT: or a7, a3, a4 ; RV32IFD-NEXT: beqz a7, .LBB45_4 ; RV32IFD-NEXT: .LBB45_3: # %entry -; RV32IFD-NEXT: slti a6, a0, 0 +; RV32IFD-NEXT: slti a6, a4, 0 ; RV32IFD-NEXT: .LBB45_4: # %entry ; RV32IFD-NEXT: neg a7, a6 ; RV32IFD-NEXT: addi t0, a6, -1 @@ -3016,21 +3016,21 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IFD-NEXT: # %bb.5: # %entry ; RV32IFD-NEXT: mv a1, a5 ; RV32IFD-NEXT: .LBB45_6: # %entry -; RV32IFD-NEXT: or a4, t0, 
a4 -; RV32IFD-NEXT: and a5, a7, a0 -; RV32IFD-NEXT: and a2, a7, a2 -; RV32IFD-NEXT: beq a1, a3, .LBB45_8 +; RV32IFD-NEXT: or a2, t0, a2 +; RV32IFD-NEXT: and a4, a7, a4 +; RV32IFD-NEXT: and a3, a7, a3 +; RV32IFD-NEXT: beq a1, a0, .LBB45_8 ; RV32IFD-NEXT: # %bb.7: # %entry -; RV32IFD-NEXT: sltu a0, a3, a1 +; RV32IFD-NEXT: sltu a0, a0, a1 ; RV32IFD-NEXT: j .LBB45_9 ; RV32IFD-NEXT: .LBB45_8: -; RV32IFD-NEXT: snez a0, a4 +; RV32IFD-NEXT: snez a0, a2 ; RV32IFD-NEXT: .LBB45_9: # %entry -; RV32IFD-NEXT: and a2, a2, a5 -; RV32IFD-NEXT: li a3, -1 -; RV32IFD-NEXT: beq a2, a3, .LBB45_11 +; RV32IFD-NEXT: and a3, a3, a4 +; RV32IFD-NEXT: li a5, -1 +; RV32IFD-NEXT: beq a3, a5, .LBB45_11 ; RV32IFD-NEXT: # %bb.10: # %entry -; RV32IFD-NEXT: slti a0, a5, 0 +; RV32IFD-NEXT: slti a0, a4, 0 ; RV32IFD-NEXT: xori a0, a0, 1 ; RV32IFD-NEXT: .LBB45_11: # %entry ; RV32IFD-NEXT: bnez a0, .LBB45_13 @@ -3038,7 +3038,7 @@ define i64 @stest_f64i64_mm(double %x) { ; RV32IFD-NEXT: lui a1, 524288 ; RV32IFD-NEXT: .LBB45_13: # %entry ; RV32IFD-NEXT: neg a0, a0 -; RV32IFD-NEXT: and a0, a0, a4 +; RV32IFD-NEXT: and a0, a0, a2 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 ; RV32IFD-NEXT: ret @@ -3072,18 +3072,18 @@ define i64 @utest_f64i64_mm(double %x) { ; RV32IF-NEXT: call __fixunsdfti@plt ; RV32IF-NEXT: lw a0, 16(sp) ; RV32IF-NEXT: lw a1, 20(sp) -; RV32IF-NEXT: lw a2, 12(sp) -; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: or a4, a1, a0 +; RV32IF-NEXT: lw a2, 8(sp) +; RV32IF-NEXT: lw a3, 12(sp) +; RV32IF-NEXT: xori a4, a0, 1 +; RV32IF-NEXT: or a4, a4, a1 ; RV32IF-NEXT: seqz a4, a4 -; RV32IF-NEXT: xori a0, a0, 1 -; RV32IF-NEXT: or a0, a0, a1 +; RV32IF-NEXT: addi a4, a4, -1 +; RV32IF-NEXT: or a0, a1, a0 ; RV32IF-NEXT: seqz a0, a0 -; RV32IF-NEXT: addi a0, a0, -1 -; RV32IF-NEXT: and a0, a0, a4 +; RV32IF-NEXT: and a0, a4, a0 ; RV32IF-NEXT: neg a1, a0 -; RV32IF-NEXT: and a0, a1, a3 -; RV32IF-NEXT: and a1, a1, a2 +; RV32IF-NEXT: and a0, a1, a2 +; RV32IF-NEXT: and a1, a1, a3 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 ; RV32IF-NEXT: ret @@ -3112,18 +3112,18 @@ define i64 @utest_f64i64_mm(double %x) { ; RV32IFD-NEXT: call __fixunsdfti@plt ; RV32IFD-NEXT: lw a0, 16(sp) ; RV32IFD-NEXT: lw a1, 20(sp) -; RV32IFD-NEXT: lw a2, 12(sp) -; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: or a4, a1, a0 +; RV32IFD-NEXT: lw a2, 8(sp) +; RV32IFD-NEXT: lw a3, 12(sp) +; RV32IFD-NEXT: xori a4, a0, 1 +; RV32IFD-NEXT: or a4, a4, a1 ; RV32IFD-NEXT: seqz a4, a4 -; RV32IFD-NEXT: xori a0, a0, 1 -; RV32IFD-NEXT: or a0, a0, a1 +; RV32IFD-NEXT: addi a4, a4, -1 +; RV32IFD-NEXT: or a0, a1, a0 ; RV32IFD-NEXT: seqz a0, a0 -; RV32IFD-NEXT: addi a0, a0, -1 -; RV32IFD-NEXT: and a0, a0, a4 +; RV32IFD-NEXT: and a0, a4, a0 ; RV32IFD-NEXT: neg a1, a0 -; RV32IFD-NEXT: and a0, a1, a3 -; RV32IFD-NEXT: and a1, a1, a2 +; RV32IFD-NEXT: and a0, a1, a2 +; RV32IFD-NEXT: and a1, a1, a3 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 ; RV32IFD-NEXT: ret @@ -3145,30 +3145,30 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti@plt -; RV32IF-NEXT: lw a0, 8(sp) -; RV32IF-NEXT: lw a1, 12(sp) -; RV32IF-NEXT: lw a2, 20(sp) +; RV32IF-NEXT: lw a0, 20(sp) +; RV32IF-NEXT: lw a1, 8(sp) +; RV32IF-NEXT: lw a2, 12(sp) ; RV32IF-NEXT: lw a3, 16(sp) -; RV32IF-NEXT: beqz a2, .LBB47_2 +; RV32IF-NEXT: beqz a0, .LBB47_2 ; RV32IF-NEXT: # %bb.1: # %entry -; RV32IF-NEXT: slti a4, a2, 0 +; RV32IF-NEXT: slti a4, a0, 0 ; 
RV32IF-NEXT: j .LBB47_3 ; RV32IF-NEXT: .LBB47_2: ; RV32IF-NEXT: seqz a4, a3 ; RV32IF-NEXT: .LBB47_3: # %entry ; RV32IF-NEXT: xori a3, a3, 1 -; RV32IF-NEXT: or a3, a3, a2 +; RV32IF-NEXT: or a3, a3, a0 ; RV32IF-NEXT: seqz a3, a3 ; RV32IF-NEXT: addi a3, a3, -1 ; RV32IF-NEXT: and a3, a3, a4 ; RV32IF-NEXT: neg a3, a3 +; RV32IF-NEXT: and a2, a3, a2 ; RV32IF-NEXT: and a1, a3, a1 ; RV32IF-NEXT: and a0, a3, a0 -; RV32IF-NEXT: and a2, a3, a2 -; RV32IF-NEXT: slti a2, a2, 0 -; RV32IF-NEXT: addi a2, a2, -1 -; RV32IF-NEXT: and a0, a2, a0 -; RV32IF-NEXT: and a1, a2, a1 +; RV32IF-NEXT: slti a0, a0, 0 +; RV32IF-NEXT: addi a3, a0, -1 +; RV32IF-NEXT: and a0, a3, a1 +; RV32IF-NEXT: and a1, a3, a2 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 ; RV32IF-NEXT: ret @@ -3203,30 +3203,30 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti@plt -; RV32IFD-NEXT: lw a0, 8(sp) -; RV32IFD-NEXT: lw a1, 12(sp) -; RV32IFD-NEXT: lw a2, 20(sp) +; RV32IFD-NEXT: lw a0, 20(sp) +; RV32IFD-NEXT: lw a1, 8(sp) +; RV32IFD-NEXT: lw a2, 12(sp) ; RV32IFD-NEXT: lw a3, 16(sp) -; RV32IFD-NEXT: beqz a2, .LBB47_2 +; RV32IFD-NEXT: beqz a0, .LBB47_2 ; RV32IFD-NEXT: # %bb.1: # %entry -; RV32IFD-NEXT: slti a4, a2, 0 +; RV32IFD-NEXT: slti a4, a0, 0 ; RV32IFD-NEXT: j .LBB47_3 ; RV32IFD-NEXT: .LBB47_2: ; RV32IFD-NEXT: seqz a4, a3 ; RV32IFD-NEXT: .LBB47_3: # %entry ; RV32IFD-NEXT: xori a3, a3, 1 -; RV32IFD-NEXT: or a3, a3, a2 +; RV32IFD-NEXT: or a3, a3, a0 ; RV32IFD-NEXT: seqz a3, a3 ; RV32IFD-NEXT: addi a3, a3, -1 ; RV32IFD-NEXT: and a3, a3, a4 ; RV32IFD-NEXT: neg a3, a3 +; RV32IFD-NEXT: and a2, a3, a2 ; RV32IFD-NEXT: and a1, a3, a1 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: and a2, a3, a2 -; RV32IFD-NEXT: slti a2, a2, 0 -; RV32IFD-NEXT: addi a2, a2, -1 -; RV32IFD-NEXT: and a0, a2, a0 -; RV32IFD-NEXT: and a1, a2, a1 +; RV32IFD-NEXT: slti a0, a0, 0 +; RV32IFD-NEXT: addi a3, a0, -1 +; RV32IFD-NEXT: and a0, a3, a1 +; RV32IFD-NEXT: and a1, a3, a2 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 ; RV32IFD-NEXT: ret @@ -3247,24 +3247,24 @@ define i64 @stest_f32i64_mm(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti@plt -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a2, 8(sp) ; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lui a3, 524288 -; RV32-NEXT: addi a5, a3, -1 +; RV32-NEXT: lw a3, 16(sp) +; RV32-NEXT: lw a4, 20(sp) +; RV32-NEXT: lui a0, 524288 +; RV32-NEXT: addi a5, a0, -1 ; RV32-NEXT: beq a1, a5, .LBB48_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a6, a1, a5 -; RV32-NEXT: or a7, a2, a0 +; RV32-NEXT: or a7, a3, a4 ; RV32-NEXT: bnez a7, .LBB48_3 ; RV32-NEXT: j .LBB48_4 ; RV32-NEXT: .LBB48_2: -; RV32-NEXT: sltiu a6, a4, -1 -; RV32-NEXT: or a7, a2, a0 +; RV32-NEXT: sltiu a6, a2, -1 +; RV32-NEXT: or a7, a3, a4 ; RV32-NEXT: beqz a7, .LBB48_4 ; RV32-NEXT: .LBB48_3: # %entry -; RV32-NEXT: slti a6, a0, 0 +; RV32-NEXT: slti a6, a4, 0 ; RV32-NEXT: .LBB48_4: # %entry ; RV32-NEXT: neg a7, a6 ; RV32-NEXT: addi t0, a6, -1 @@ -3272,21 +3272,21 @@ define i64 @stest_f32i64_mm(float %x) { ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB48_6: # %entry -; RV32-NEXT: or a4, t0, a4 -; RV32-NEXT: and a5, a7, a0 -; RV32-NEXT: and a2, a7, a2 -; RV32-NEXT: beq a1, a3, .LBB48_8 +; RV32-NEXT: or a2, t0, a2 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a3, a7, a3 +; RV32-NEXT: beq a1, a0, 
.LBB48_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: j .LBB48_9 ; RV32-NEXT: .LBB48_8: -; RV32-NEXT: snez a0, a4 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: .LBB48_9: # %entry -; RV32-NEXT: and a2, a2, a5 -; RV32-NEXT: li a3, -1 -; RV32-NEXT: beq a2, a3, .LBB48_11 +; RV32-NEXT: and a3, a3, a4 +; RV32-NEXT: li a5, -1 +; RV32-NEXT: beq a3, a5, .LBB48_11 ; RV32-NEXT: # %bb.10: # %entry -; RV32-NEXT: slti a0, a5, 0 +; RV32-NEXT: slti a0, a4, 0 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: .LBB48_11: # %entry ; RV32-NEXT: bnez a0, .LBB48_13 @@ -3294,7 +3294,7 @@ define i64 @stest_f32i64_mm(float %x) { ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB48_13: # %entry ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -3326,18 +3326,18 @@ define i64 @utest_f32i64_mm(float %x) { ; RV32-NEXT: call __fixunssfti@plt ; RV32-NEXT: lw a0, 16(sp) ; RV32-NEXT: lw a1, 20(sp) -; RV32-NEXT: lw a2, 12(sp) -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 +; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: xori a4, a0, 1 +; RV32-NEXT: or a4, a4, a1 ; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: xori a0, a0, 1 -; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a0, a4, a0 ; RV32-NEXT: neg a1, a0 -; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: and a0, a1, a2 +; RV32-NEXT: and a1, a1, a3 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -3371,30 +3371,30 @@ define i64 @ustest_f32i64_mm(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti@plt -; RV32-NEXT: lw a0, 8(sp) -; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a2, 20(sp) +; RV32-NEXT: lw a0, 20(sp) +; RV32-NEXT: lw a1, 8(sp) +; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a2, .LBB50_2 +; RV32-NEXT: beqz a0, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a2, 0 +; RV32-NEXT: slti a4, a0, 0 ; RV32-NEXT: j .LBB50_3 ; RV32-NEXT: .LBB50_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB50_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: or a3, a3, a0 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 +; RV32-NEXT: and a2, a3, a2 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: and a2, a3, a2 -; RV32-NEXT: slti a2, a2, 0 -; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: and a0, a3, a1 +; RV32-NEXT: and a1, a3, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -3438,24 +3438,24 @@ define i64 @stest_f16i64_mm(half %x) { ; RV32-NEXT: call __extendhfsf2@plt ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti@plt -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a2, 8(sp) ; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lui a3, 524288 -; RV32-NEXT: addi a5, a3, -1 +; RV32-NEXT: lw a3, 16(sp) +; RV32-NEXT: lw a4, 20(sp) +; RV32-NEXT: lui a0, 524288 +; RV32-NEXT: addi a5, a0, -1 ; RV32-NEXT: beq a1, a5, .LBB51_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a6, a1, a5 -; RV32-NEXT: or 
a7, a2, a0 +; RV32-NEXT: or a7, a3, a4 ; RV32-NEXT: bnez a7, .LBB51_3 ; RV32-NEXT: j .LBB51_4 ; RV32-NEXT: .LBB51_2: -; RV32-NEXT: sltiu a6, a4, -1 -; RV32-NEXT: or a7, a2, a0 +; RV32-NEXT: sltiu a6, a2, -1 +; RV32-NEXT: or a7, a3, a4 ; RV32-NEXT: beqz a7, .LBB51_4 ; RV32-NEXT: .LBB51_3: # %entry -; RV32-NEXT: slti a6, a0, 0 +; RV32-NEXT: slti a6, a4, 0 ; RV32-NEXT: .LBB51_4: # %entry ; RV32-NEXT: neg a7, a6 ; RV32-NEXT: addi t0, a6, -1 @@ -3463,21 +3463,21 @@ define i64 @stest_f16i64_mm(half %x) { ; RV32-NEXT: # %bb.5: # %entry ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB51_6: # %entry -; RV32-NEXT: or a4, t0, a4 -; RV32-NEXT: and a5, a7, a0 -; RV32-NEXT: and a2, a7, a2 -; RV32-NEXT: beq a1, a3, .LBB51_8 +; RV32-NEXT: or a2, t0, a2 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: and a3, a7, a3 +; RV32-NEXT: beq a1, a0, .LBB51_8 ; RV32-NEXT: # %bb.7: # %entry -; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: j .LBB51_9 ; RV32-NEXT: .LBB51_8: -; RV32-NEXT: snez a0, a4 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: .LBB51_9: # %entry -; RV32-NEXT: and a2, a2, a5 -; RV32-NEXT: li a3, -1 -; RV32-NEXT: beq a2, a3, .LBB51_11 +; RV32-NEXT: and a3, a3, a4 +; RV32-NEXT: li a5, -1 +; RV32-NEXT: beq a3, a5, .LBB51_11 ; RV32-NEXT: # %bb.10: # %entry -; RV32-NEXT: slti a0, a5, 0 +; RV32-NEXT: slti a0, a4, 0 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: .LBB51_11: # %entry ; RV32-NEXT: bnez a0, .LBB51_13 @@ -3485,7 +3485,7 @@ define i64 @stest_f16i64_mm(half %x) { ; RV32-NEXT: lui a1, 524288 ; RV32-NEXT: .LBB51_13: # %entry ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -3549,18 +3549,18 @@ define i64 @utesth_f16i64_mm(half %x) { ; RV32-NEXT: call __fixunssfti@plt ; RV32-NEXT: lw a0, 16(sp) ; RV32-NEXT: lw a1, 20(sp) -; RV32-NEXT: lw a2, 12(sp) -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 +; RV32-NEXT: lw a2, 8(sp) +; RV32-NEXT: lw a3, 12(sp) +; RV32-NEXT: xori a4, a0, 1 +; RV32-NEXT: or a4, a4, a1 ; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: xori a0, a0, 1 -; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a0, a4, a0 ; RV32-NEXT: neg a1, a0 -; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: and a0, a1, a2 +; RV32-NEXT: and a1, a1, a3 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -3596,30 +3596,30 @@ define i64 @ustest_f16i64_mm(half %x) { ; RV32-NEXT: call __extendhfsf2@plt ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti@plt -; RV32-NEXT: lw a0, 8(sp) -; RV32-NEXT: lw a1, 12(sp) -; RV32-NEXT: lw a2, 20(sp) +; RV32-NEXT: lw a0, 20(sp) +; RV32-NEXT: lw a1, 8(sp) +; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a2, .LBB53_2 +; RV32-NEXT: beqz a0, .LBB53_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a2, 0 +; RV32-NEXT: slti a4, a0, 0 ; RV32-NEXT: j .LBB53_3 ; RV32-NEXT: .LBB53_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB53_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: or a3, a3, a0 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 +; RV32-NEXT: and a2, a3, a2 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: and a2, a3, a2 -; RV32-NEXT: slti a2, a2, 0 -; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: and a0, a2, a0 
-; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: and a0, a3, a1 +; RV32-NEXT: and a1, a3, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll index cb64e24128b5e..81f6890a6b141 100644 --- a/llvm/test/CodeGen/RISCV/iabs.ll +++ b/llvm/test/CodeGen/RISCV/iabs.ll @@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) { define i128 @abs128(i128 %x) { ; RV32I-LABEL: abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a2, 12(a1) -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a2, 4(a1) ; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a2, .LBB8_2 +; RV32I-NEXT: bgez a4, .LBB8_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: neg a5, a1 -; RV32I-NEXT: or a6, a4, a3 -; RV32I-NEXT: snez a6, a6 -; RV32I-NEXT: sltu a7, a5, a6 -; RV32I-NEXT: snez a1, a1 -; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: snez a5, a1 +; RV32I-NEXT: add a4, a4, a5 ; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a2, a1, a7 -; RV32I-NEXT: sub a1, a5, a6 -; RV32I-NEXT: snez a5, a4 -; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: or a5, a3, a2 +; RV32I-NEXT: snez a5, a5 +; RV32I-NEXT: sltu a6, a1, a5 ; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: snez a6, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: .LBB8_2: -; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a2, 12(a1) -; RV32ZBB-NEXT: lw a3, 4(a1) -; RV32ZBB-NEXT: lw a4, 0(a1) +; RV32ZBB-NEXT: lw a4, 12(a1) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a2, 4(a1) ; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a2, .LBB8_2 +; RV32ZBB-NEXT: bgez a4, .LBB8_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: neg a5, a1 -; RV32ZBB-NEXT: or a6, a4, a3 -; RV32ZBB-NEXT: snez a6, a6 -; RV32ZBB-NEXT: sltu a7, a5, a6 -; RV32ZBB-NEXT: snez a1, a1 -; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: snez a5, a1 +; RV32ZBB-NEXT: add a4, a4, a5 ; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a2, a1, a7 -; RV32ZBB-NEXT: sub a1, a5, a6 -; RV32ZBB-NEXT: snez a5, a4 -; RV32ZBB-NEXT: neg a3, a3 -; RV32ZBB-NEXT: sub a3, a3, a5 +; RV32ZBB-NEXT: or a5, a3, a2 +; RV32ZBB-NEXT: snez a5, a5 +; RV32ZBB-NEXT: sltu a6, a1, a5 ; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: snez a6, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a1, a1, a5 +; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: .LBB8_2: -; RV32ZBB-NEXT: sw a4, 0(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a3, 4(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: abs128: @@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) { define i128 @select_abs128(i128 %x) { ; RV32I-LABEL: select_abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a2, 12(a1) -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a2, 4(a1) ; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a2, .LBB9_2 +; RV32I-NEXT: bgez a4, .LBB9_2 ; RV32I-NEXT: # %bb.1: -; 
RV32I-NEXT: neg a5, a1 -; RV32I-NEXT: or a6, a4, a3 -; RV32I-NEXT: snez a6, a6 -; RV32I-NEXT: sltu a7, a5, a6 -; RV32I-NEXT: snez a1, a1 -; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: snez a5, a1 +; RV32I-NEXT: add a4, a4, a5 ; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a2, a1, a7 -; RV32I-NEXT: sub a1, a5, a6 -; RV32I-NEXT: snez a5, a4 -; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: or a5, a3, a2 +; RV32I-NEXT: snez a5, a5 +; RV32I-NEXT: sltu a6, a1, a5 ; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: snez a6, a3 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: select_abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a2, 12(a1) -; RV32ZBB-NEXT: lw a3, 4(a1) -; RV32ZBB-NEXT: lw a4, 0(a1) +; RV32ZBB-NEXT: lw a4, 12(a1) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a2, 4(a1) ; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a2, .LBB9_2 +; RV32ZBB-NEXT: bgez a4, .LBB9_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: neg a5, a1 -; RV32ZBB-NEXT: or a6, a4, a3 -; RV32ZBB-NEXT: snez a6, a6 -; RV32ZBB-NEXT: sltu a7, a5, a6 -; RV32ZBB-NEXT: snez a1, a1 -; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: snez a5, a1 +; RV32ZBB-NEXT: add a4, a4, a5 ; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a2, a1, a7 -; RV32ZBB-NEXT: sub a1, a5, a6 -; RV32ZBB-NEXT: snez a5, a4 -; RV32ZBB-NEXT: neg a3, a3 -; RV32ZBB-NEXT: sub a3, a3, a5 +; RV32ZBB-NEXT: or a5, a3, a2 +; RV32ZBB-NEXT: snez a5, a5 +; RV32ZBB-NEXT: sltu a6, a1, a5 ; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: snez a6, a3 +; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a1, a1, a5 +; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: .LBB9_2: -; RV32ZBB-NEXT: sw a4, 0(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a3, 4(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: select_abs128: diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll index 15abc9b75883c..5b296274e552f 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll @@ -9,8 +9,8 @@ define i16 @ctz_v4i32(<4 x i32> %a) { ; RV32: # %bb.0: ; RV32-NEXT: lw a3, 0(a0) ; RV32-NEXT: lw a1, 4(a0) -; RV32-NEXT: lw a2, 12(a0) ; RV32-NEXT: lw a4, 8(a0) +; RV32-NEXT: lw a2, 12(a0) ; RV32-NEXT: seqz a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: andi a0, a0, 4 @@ -42,8 +42,8 @@ define i16 @ctz_v4i32(<4 x i32> %a) { ; RV64: # %bb.0: ; RV64-NEXT: lw a3, 0(a0) ; RV64-NEXT: lw a1, 8(a0) -; RV64-NEXT: lw a2, 24(a0) ; RV64-NEXT: lw a4, 16(a0) +; RV64-NEXT: lw a2, 24(a0) ; RV64-NEXT: seqz a0, a3 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: andi a0, a0, 4 diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll index 13d03c5217fb1..591abbc767e2d 100644 --- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll +++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll @@ -56,15 +56,15 @@ entry: define void @test3(ptr %a, ptr %b) nounwind { ; RV32-LABEL: test3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw a2, 8(a1) ; RV32-NEXT: lw a3, 12(a1) -; 
RV32-NEXT: lw a4, 8(a1) -; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a4, 0(a1) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: lui a5, 524288 ; RV32-NEXT: xor a3, a3, a5 -; RV32-NEXT: sw a4, 8(a0) -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: sw a4, 0(a0) +; RV32-NEXT: sw a1, 4(a0) ; RV32-NEXT: sw a3, 12(a0) ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll index bfac15e009f00..22199eedc231c 100644 --- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll +++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll @@ -222,32 +222,32 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) { ; RV64IFD-NEXT: .cfi_offset s1, -24 ; RV64IFD-NEXT: .cfi_offset s2, -32 ; RV64IFD-NEXT: .cfi_offset fs0, -40 -; RV64IFD-NEXT: lhu s1, 16(a1) -; RV64IFD-NEXT: lhu s2, 0(a1) -; RV64IFD-NEXT: lhu a1, 8(a1) +; RV64IFD-NEXT: lhu s1, 0(a1) +; RV64IFD-NEXT: lhu a2, 8(a1) +; RV64IFD-NEXT: lhu s2, 16(a1) ; RV64IFD-NEXT: mv s0, a0 -; RV64IFD-NEXT: fmv.w.x fa0, a1 +; RV64IFD-NEXT: fmv.w.x fa0, a2 ; RV64IFD-NEXT: call __extendhfsf2@plt ; RV64IFD-NEXT: call exp10f@plt ; RV64IFD-NEXT: call __truncsfhf2@plt ; RV64IFD-NEXT: fmv.s fs0, fa0 -; RV64IFD-NEXT: fmv.w.x fa0, s2 +; RV64IFD-NEXT: fmv.w.x fa0, s1 ; RV64IFD-NEXT: call __extendhfsf2@plt ; RV64IFD-NEXT: call exp10f@plt ; RV64IFD-NEXT: fmv.x.w a0, fs0 -; RV64IFD-NEXT: slli s2, a0, 16 +; RV64IFD-NEXT: slli s1, a0, 16 ; RV64IFD-NEXT: call __truncsfhf2@plt ; RV64IFD-NEXT: fmv.x.w a0, fa0 ; RV64IFD-NEXT: slli a0, a0, 48 ; RV64IFD-NEXT: srli a0, a0, 48 -; RV64IFD-NEXT: or s2, a0, s2 -; RV64IFD-NEXT: fmv.w.x fa0, s1 +; RV64IFD-NEXT: or s1, a0, s1 +; RV64IFD-NEXT: fmv.w.x fa0, s2 ; RV64IFD-NEXT: call __extendhfsf2@plt ; RV64IFD-NEXT: call exp10f@plt ; RV64IFD-NEXT: call __truncsfhf2@plt ; RV64IFD-NEXT: fmv.x.w a0, fa0 ; RV64IFD-NEXT: sh a0, 4(s0) -; RV64IFD-NEXT: sw s2, 0(s0) +; RV64IFD-NEXT: sw s1, 0(s0) ; RV64IFD-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64IFD-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64IFD-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -349,27 +349,27 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; RV64IFD-NEXT: .cfi_offset fs0, -48 ; RV64IFD-NEXT: .cfi_offset fs1, -56 ; RV64IFD-NEXT: .cfi_offset fs2, -64 -; RV64IFD-NEXT: lhu s1, 24(a1) -; RV64IFD-NEXT: lhu s2, 0(a1) -; RV64IFD-NEXT: lhu s3, 8(a1) -; RV64IFD-NEXT: lhu a1, 16(a1) +; RV64IFD-NEXT: lhu s1, 0(a1) +; RV64IFD-NEXT: lhu s2, 8(a1) +; RV64IFD-NEXT: lhu a2, 16(a1) +; RV64IFD-NEXT: lhu s3, 24(a1) ; RV64IFD-NEXT: mv s0, a0 -; RV64IFD-NEXT: fmv.w.x fa0, a1 +; RV64IFD-NEXT: fmv.w.x fa0, a2 ; RV64IFD-NEXT: call __extendhfsf2@plt ; RV64IFD-NEXT: call exp10f@plt ; RV64IFD-NEXT: call __truncsfhf2@plt ; RV64IFD-NEXT: fmv.s fs0, fa0 -; RV64IFD-NEXT: fmv.w.x fa0, s3 +; RV64IFD-NEXT: fmv.w.x fa0, s2 ; RV64IFD-NEXT: call __extendhfsf2@plt ; RV64IFD-NEXT: call exp10f@plt ; RV64IFD-NEXT: call __truncsfhf2@plt ; RV64IFD-NEXT: fmv.s fs1, fa0 -; RV64IFD-NEXT: fmv.w.x fa0, s2 +; RV64IFD-NEXT: fmv.w.x fa0, s1 ; RV64IFD-NEXT: call __extendhfsf2@plt ; RV64IFD-NEXT: call exp10f@plt ; RV64IFD-NEXT: call __truncsfhf2@plt ; RV64IFD-NEXT: fmv.s fs2, fa0 -; RV64IFD-NEXT: fmv.w.x fa0, s1 +; RV64IFD-NEXT: fmv.w.x fa0, s3 ; RV64IFD-NEXT: call __extendhfsf2@plt ; RV64IFD-NEXT: call exp10f@plt ; RV64IFD-NEXT: fmv.x.w s1, fs2 diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll index 94b9444dfaf8c..fa80f1f3c48d2 100644 --- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll +++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll @@ -738,25 
+738,25 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 28(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 12(a1) -; RV32I-NEXT: lw s1, 8(a1) -; RV32I-NEXT: lw s2, 4(a1) ; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw s0, 4(a1) +; RV32I-NEXT: lw s1, 8(a1) +; RV32I-NEXT: lw s2, 12(a1) ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: addi a1, sp, 12 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call frexpf@plt -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: addi a1, sp, 16 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: addi a1, sp, 20 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: lw a1, 8(sp) ; RV32I-NEXT: lw a2, 12(sp) @@ -764,7 +764,7 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; RV32I-NEXT: lw a4, 20(sp) ; RV32I-NEXT: sw a0, 12(s3) ; RV32I-NEXT: sw s1, 8(s3) -; RV32I-NEXT: sw s2, 4(s3) +; RV32I-NEXT: sw s0, 4(s3) ; RV32I-NEXT: sw s4, 0(s3) ; RV32I-NEXT: sw a4, 28(s3) ; RV32I-NEXT: sw a3, 24(s3) @@ -788,25 +788,25 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw s0, 24(a1) -; RV64I-NEXT: lw s1, 16(a1) -; RV64I-NEXT: lw s2, 8(a1) ; RV64I-NEXT: lw a2, 0(a1) +; RV64I-NEXT: lw s0, 8(a1) +; RV64I-NEXT: lw s1, 16(a1) +; RV64I-NEXT: lw s2, 24(a1) ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: addi a1, sp, 4 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call frexpf@plt -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: addi a1, sp, 8 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: lw a1, 0(sp) ; RV64I-NEXT: lw a2, 4(sp) @@ -814,7 +814,7 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; RV64I-NEXT: lw a4, 12(sp) ; RV64I-NEXT: sw a0, 12(s3) ; RV64I-NEXT: sw s1, 8(s3) -; RV64I-NEXT: sw s2, 4(s3) +; RV64I-NEXT: sw s0, 4(s3) ; RV64I-NEXT: sw s4, 0(s3) ; RV64I-NEXT: sw a4, 28(s3) ; RV64I-NEXT: sw a3, 24(s3) @@ -1006,29 +1006,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi ; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 28(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 12(a1) -; RV32I-NEXT: lw s1, 8(a1) -; RV32I-NEXT: lw s2, 4(a1) ; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw s0, 4(a1) +; RV32I-NEXT: lw s1, 8(a1) +; RV32I-NEXT: lw s2, 12(a1) ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: addi a1, sp, 12 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call frexpf@plt -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: addi a1, sp, 16 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: addi a1, sp, 20 -; RV32I-NEXT: 
mv a0, s0 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: sw a0, 12(s3) ; RV32I-NEXT: sw s1, 8(s3) -; RV32I-NEXT: sw s2, 4(s3) +; RV32I-NEXT: sw s0, 4(s3) ; RV32I-NEXT: sw s4, 0(s3) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload @@ -1048,29 +1048,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi ; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw s0, 24(a1) -; RV64I-NEXT: lw s1, 16(a1) -; RV64I-NEXT: lw s2, 8(a1) ; RV64I-NEXT: lw a2, 0(a1) +; RV64I-NEXT: lw s0, 8(a1) +; RV64I-NEXT: lw s1, 16(a1) +; RV64I-NEXT: lw s2, 24(a1) ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: addi a1, sp, 4 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call frexpf@plt -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: addi a1, sp, 8 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: sw a0, 12(s3) ; RV64I-NEXT: sw s1, 8(s3) -; RV64I-NEXT: sw s2, 4(s3) +; RV64I-NEXT: sw s0, 4(s3) ; RV64I-NEXT: sw s4, 0(s3) ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -1254,22 +1254,22 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; RV32I-NEXT: sw s1, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 12(a1) -; RV32I-NEXT: lw s1, 8(a1) -; RV32I-NEXT: lw s2, 4(a1) ; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw s0, 4(a1) +; RV32I-NEXT: lw s1, 8(a1) +; RV32I-NEXT: lw s2, 12(a1) ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: addi a1, sp, 12 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: addi a1, sp, 16 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: addi a1, sp, 20 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: addi a1, sp, 24 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call frexpf@plt ; RV32I-NEXT: lw a0, 24(sp) ; RV32I-NEXT: lw a1, 20(sp) @@ -1295,22 +1295,22 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw s0, 24(a1) -; RV64I-NEXT: lw s1, 16(a1) -; RV64I-NEXT: lw s2, 8(a1) ; RV64I-NEXT: lw a2, 0(a1) +; RV64I-NEXT: lw s0, 8(a1) +; RV64I-NEXT: lw s1, 16(a1) +; RV64I-NEXT: lw s2, 24(a1) ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: addi a1, sp, 8 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: addi a1, sp, 16 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: addi a1, sp, 20 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call frexpf@plt ; RV64I-NEXT: lw a0, 20(sp) ; RV64I-NEXT: lw a1, 16(sp) @@ -1584,16 +1584,16 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind { ; RV32IFD-NEXT: addi a2, sp, 36 ; RV32IFD-NEXT: sw a3, 0(sp) ; RV32IFD-NEXT: call frexpl@plt -; RV32IFD-NEXT: lw a0, 36(sp) 
+; RV32IFD-NEXT: lw a0, 24(sp) ; RV32IFD-NEXT: lw a1, 28(sp) -; RV32IFD-NEXT: lw a2, 24(sp) +; RV32IFD-NEXT: lw a2, 16(sp) ; RV32IFD-NEXT: lw a3, 20(sp) -; RV32IFD-NEXT: lw a4, 16(sp) +; RV32IFD-NEXT: lw a4, 36(sp) ; RV32IFD-NEXT: sw a1, 12(s0) -; RV32IFD-NEXT: sw a2, 8(s0) +; RV32IFD-NEXT: sw a0, 8(s0) ; RV32IFD-NEXT: sw a3, 4(s0) -; RV32IFD-NEXT: sw a4, 0(s0) -; RV32IFD-NEXT: sw a0, 16(s0) +; RV32IFD-NEXT: sw a2, 0(s0) +; RV32IFD-NEXT: sw a4, 16(s0) ; RV32IFD-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 48 @@ -1637,16 +1637,16 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind { ; RV32IZFINXZDINX-NEXT: addi a2, sp, 36 ; RV32IZFINXZDINX-NEXT: sw a3, 0(sp) ; RV32IZFINXZDINX-NEXT: call frexpl@plt -; RV32IZFINXZDINX-NEXT: lw a0, 36(sp) +; RV32IZFINXZDINX-NEXT: lw a0, 24(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 28(sp) -; RV32IZFINXZDINX-NEXT: lw a2, 24(sp) +; RV32IZFINXZDINX-NEXT: lw a2, 16(sp) ; RV32IZFINXZDINX-NEXT: lw a3, 20(sp) -; RV32IZFINXZDINX-NEXT: lw a4, 16(sp) +; RV32IZFINXZDINX-NEXT: lw a4, 36(sp) ; RV32IZFINXZDINX-NEXT: sw a1, 12(s0) -; RV32IZFINXZDINX-NEXT: sw a2, 8(s0) +; RV32IZFINXZDINX-NEXT: sw a0, 8(s0) ; RV32IZFINXZDINX-NEXT: sw a3, 4(s0) -; RV32IZFINXZDINX-NEXT: sw a4, 0(s0) -; RV32IZFINXZDINX-NEXT: sw a0, 16(s0) +; RV32IZFINXZDINX-NEXT: sw a2, 0(s0) +; RV32IZFINXZDINX-NEXT: sw a4, 16(s0) ; RV32IZFINXZDINX-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 48 @@ -1690,16 +1690,16 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind { ; RV32I-NEXT: addi a2, sp, 36 ; RV32I-NEXT: sw a3, 0(sp) ; RV32I-NEXT: call frexpl@plt -; RV32I-NEXT: lw a0, 36(sp) +; RV32I-NEXT: lw a0, 24(sp) ; RV32I-NEXT: lw a1, 28(sp) -; RV32I-NEXT: lw a2, 24(sp) +; RV32I-NEXT: lw a2, 16(sp) ; RV32I-NEXT: lw a3, 20(sp) -; RV32I-NEXT: lw a4, 16(sp) +; RV32I-NEXT: lw a4, 36(sp) ; RV32I-NEXT: sw a1, 12(s0) -; RV32I-NEXT: sw a2, 8(s0) +; RV32I-NEXT: sw a0, 8(s0) ; RV32I-NEXT: sw a3, 4(s0) -; RV32I-NEXT: sw a4, 0(s0) -; RV32I-NEXT: sw a0, 16(s0) +; RV32I-NEXT: sw a2, 0(s0) +; RV32I-NEXT: sw a4, 16(s0) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 48 diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll index 3718ce80142d4..8ffc302a5c13d 100644 --- a/llvm/test/CodeGen/RISCV/mem.ll +++ b/llvm/test/CodeGen/RISCV/mem.ll @@ -22,9 +22,8 @@ define dso_local i32 @lb(ptr %a) nounwind { define dso_local i32 @lh(ptr %a) nounwind { ; RV32I-LABEL: lh: ; RV32I: # %bb.0: -; RV32I-NEXT: lh a1, 4(a0) ; RV32I-NEXT: lh zero, 0(a0) -; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: lh a0, 4(a0) ; RV32I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 2 %2 = load i16, ptr %1 @@ -37,9 +36,8 @@ define dso_local i32 @lh(ptr %a) nounwind { define dso_local i32 @lw(ptr %a) nounwind { ; RV32I-LABEL: lw: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a1, 12(a0) ; RV32I-NEXT: lw zero, 0(a0) -; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: lw a0, 12(a0) ; RV32I-NEXT: ret %1 = getelementptr i32, ptr %a, i32 3 %2 = load i32, ptr %1 @@ -50,9 +48,9 @@ define dso_local i32 @lw(ptr %a) nounwind { define dso_local i32 @lbu(ptr %a) nounwind { ; RV32I-LABEL: lbu: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a1, 4(a0) -; RV32I-NEXT: lbu a0, 0(a0) -; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a0, 4(a0) +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret %1 = getelementptr 
i8, ptr %a, i32 4 %2 = load i8, ptr %1 @@ -66,9 +64,9 @@ define dso_local i32 @lbu(ptr %a) nounwind { define dso_local i32 @lhu(ptr %a) nounwind { ; RV32I-LABEL: lhu: ; RV32I: # %bb.0: -; RV32I-NEXT: lhu a1, 10(a0) -; RV32I-NEXT: lhu a0, 0(a0) -; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 10(a0) +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 5 %2 = load i16, ptr %1 @@ -121,10 +119,10 @@ define dso_local void @sw(ptr %a, i32 %b) nounwind { define dso_local i32 @load_sext_zext_anyext_i1(ptr %a) nounwind { ; RV32I-LABEL: load_sext_zext_anyext_i1: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a1, 1(a0) -; RV32I-NEXT: lbu a2, 2(a0) ; RV32I-NEXT: lbu zero, 0(a0) -; RV32I-NEXT: sub a0, a2, a1 +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: lbu a0, 2(a0) +; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 @@ -143,10 +141,10 @@ define dso_local i32 @load_sext_zext_anyext_i1(ptr %a) nounwind { define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind { ; RV32I-LABEL: load_sext_zext_anyext_i1_i16: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a1, 1(a0) -; RV32I-NEXT: lbu a2, 2(a0) ; RV32I-NEXT: lbu zero, 0(a0) -; RV32I-NEXT: sub a0, a2, a1 +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: lbu a0, 2(a0) +; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll index 09b04535498c5..befef07476d04 100644 --- a/llvm/test/CodeGen/RISCV/mem64.ll +++ b/llvm/test/CodeGen/RISCV/mem64.ll @@ -22,9 +22,8 @@ define dso_local i64 @lb(ptr %a) nounwind { define dso_local i64 @lh(ptr %a) nounwind { ; RV64I-LABEL: lh: ; RV64I: # %bb.0: -; RV64I-NEXT: lh a1, 4(a0) ; RV64I-NEXT: lh zero, 0(a0) -; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: lh a0, 4(a0) ; RV64I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 2 %2 = load i16, ptr %1 @@ -37,9 +36,8 @@ define dso_local i64 @lh(ptr %a) nounwind { define dso_local i64 @lw(ptr %a) nounwind { ; RV64I-LABEL: lw: ; RV64I: # %bb.0: -; RV64I-NEXT: lw a1, 12(a0) ; RV64I-NEXT: lw zero, 0(a0) -; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: lw a0, 12(a0) ; RV64I-NEXT: ret %1 = getelementptr i32, ptr %a, i32 3 %2 = load i32, ptr %1 @@ -52,9 +50,9 @@ define dso_local i64 @lw(ptr %a) nounwind { define dso_local i64 @lbu(ptr %a) nounwind { ; RV64I-LABEL: lbu: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a1, 4(a0) -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: lbu a0, 4(a0) +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i8, ptr %a, i32 4 %2 = load i8, ptr %1 @@ -68,9 +66,9 @@ define dso_local i64 @lbu(ptr %a) nounwind { define dso_local i64 @lhu(ptr %a) nounwind { ; RV64I-LABEL: lhu: ; RV64I: # %bb.0: -; RV64I-NEXT: lhu a1, 10(a0) -; RV64I-NEXT: lhu a0, 0(a0) -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 10(a0) +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 5 %2 = load i16, ptr %1 @@ -84,9 +82,9 @@ define dso_local i64 @lhu(ptr %a) nounwind { define dso_local i64 @lwu(ptr %a) nounwind { ; RV64I-LABEL: lwu: ; RV64I: # %bb.0: -; RV64I-NEXT: lwu a1, 24(a0) -; RV64I-NEXT: lwu a0, 0(a0) -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lwu a1, 0(a0) +; RV64I-NEXT: lwu a0, 24(a0) +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i32, ptr %a, i32 6 %2 = load i32, ptr %1 @@ -140,9 +138,8 @@ define dso_local void @sw(ptr %a, i32 %b) nounwind { 
define dso_local i64 @ld(ptr %a) nounwind { ; RV64I-LABEL: ld: ; RV64I: # %bb.0: -; RV64I-NEXT: ld a1, 80(a0) ; RV64I-NEXT: ld zero, 0(a0) -; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: ld a0, 80(a0) ; RV64I-NEXT: ret %1 = getelementptr i64, ptr %a, i32 10 %2 = load i64, ptr %1 @@ -166,10 +163,10 @@ define dso_local void @sd(ptr %a, i64 %b) nounwind { define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind { ; RV64I-LABEL: load_sext_zext_anyext_i1: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a2, 2(a0) ; RV64I-NEXT: lbu zero, 0(a0) -; RV64I-NEXT: sub a0, a2, a1 +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: lbu a0, 2(a0) +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 @@ -188,10 +185,10 @@ define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind { define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind { ; RV64I-LABEL: load_sext_zext_anyext_i1_i16: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a2, 2(a0) ; RV64I-NEXT: lbu zero, 0(a0) -; RV64I-NEXT: sub a0, a2, a1 +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: lbu a0, 2(a0) +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 diff --git a/llvm/test/CodeGen/RISCV/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/memcpy-inline.ll index 343695ee37da8..266afe0003728 100644 --- a/llvm/test/CodeGen/RISCV/memcpy-inline.ll +++ b/llvm/test/CodeGen/RISCV/memcpy-inline.ll @@ -45,16 +45,16 @@ define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { ; RV32-LABEL: unaligned_memcpy2: ; RV32: # %bb.0: # %entry ; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a2, 1(a0) ; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: unaligned_memcpy2: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a2, 1(a0) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; @@ -78,20 +78,20 @@ define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { ; RV32-LABEL: unaligned_memcpy3: ; RV32: # %bb.0: # %entry ; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a3, 1(a1) ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: sb a3, 1(a0) ; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: unaligned_memcpy3: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a3, 1(a1) ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: sb a3, 1(a0) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; @@ -119,24 +119,24 @@ define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { ; RV32-LABEL: unaligned_memcpy4: ; RV32: # %bb.0: # %entry ; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a3, 2(a1) +; RV32-NEXT: lbu a4, 1(a1) ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: sb a3, 2(a0) +; RV32-NEXT: sb a4, 1(a0) ; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: unaligned_memcpy4: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a3, 2(a1) +; RV64-NEXT: lbu a4, 1(a1) ; RV64-NEXT: lbu 
a1, 0(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: sb a3, 2(a0) +; RV64-NEXT: sb a4, 1(a0) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; @@ -166,12 +166,12 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: lbu a2, 4(a1) ; RV32-NEXT: sb a2, 4(a0) ; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a3, 2(a1) +; RV32-NEXT: lbu a4, 1(a1) ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: sb a3, 2(a0) +; RV32-NEXT: sb a4, 1(a0) ; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; @@ -184,28 +184,28 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: lbu a2, 4(a1) ; RV64-NEXT: sb a2, 4(a0) ; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a3, 2(a1) +; RV64-NEXT: lbu a4, 1(a1) ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: sb a3, 2(a0) +; RV64-NEXT: sb a4, 1(a0) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: unaligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 3(a1) -; RV32-FAST-NEXT: sw a2, 3(a0) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy7: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: lw a2, 3(a1) -; RV64-FAST-NEXT: sw a2, 3(a0) ; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) ; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -225,12 +225,12 @@ define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: lbu a2, 4(a1) ; RV32-NEXT: sb a2, 4(a0) ; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a3, 2(a1) +; RV32-NEXT: lbu a4, 1(a1) ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: sb a3, 2(a0) +; RV32-NEXT: sb a4, 1(a0) ; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; @@ -245,20 +245,20 @@ define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: lbu a2, 4(a1) ; RV64-NEXT: sb a2, 4(a0) ; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a3, 2(a1) +; RV64-NEXT: lbu a4, 1(a1) ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: sb a3, 2(a0) +; RV64-NEXT: sb a4, 1(a0) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: unaligned_memcpy8: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; @@ -298,12 +298,12 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: lbu a2, 4(a1) ; RV32-NEXT: sb a2, 4(a0) ; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a3, 2(a1) +; RV32-NEXT: lbu a4, 1(a1) ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: sb a3, 2(a0) +; RV32-NEXT: sb a4, 1(a0) ; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; @@ -332,32 +332,32 @@ define void @unaligned_memcpy15(ptr 
nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: lbu a2, 4(a1) ; RV64-NEXT: sb a2, 4(a0) ; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a3, 2(a1) +; RV64-NEXT: lbu a4, 1(a1) ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: sb a3, 2(a0) +; RV64-NEXT: sb a4, 1(a0) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: unaligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 11(a1) -; RV32-FAST-NEXT: sw a2, 11(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a3, 8(a1) +; RV32-FAST-NEXT: lw a4, 4(a1) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 7(a1) -; RV64-FAST-NEXT: sd a2, 7(a0) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -393,12 +393,12 @@ define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: lbu a2, 4(a1) ; RV32-NEXT: sb a2, 4(a0) ; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a3, 2(a1) +; RV32-NEXT: lbu a4, 1(a1) ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: sb a3, 2(a0) +; RV32-NEXT: sb a4, 1(a0) ; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; @@ -429,32 +429,32 @@ define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: lbu a2, 4(a1) ; RV64-NEXT: sb a2, 4(a0) ; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a3, 2(a1) +; RV64-NEXT: lbu a4, 1(a1) ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: sb a3, 2(a0) +; RV64-NEXT: sb a4, 1(a0) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: unaligned_memcpy16: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a3, 8(a1) +; RV32-FAST-NEXT: lw a4, 4(a1) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy16: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -520,12 +520,12 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: lbu a2, 4(a1) ; RV32-NEXT: sb a2, 4(a0) ; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a3, 2(a1) +; RV32-NEXT: lbu a4, 1(a1) ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: sb a3, 2(a0) +; RV32-NEXT: sb a4, 1(a0) ; RV32-NEXT: sb a1, 0(a0) ; 
RV32-NEXT: ret ; @@ -586,12 +586,12 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: lbu a2, 4(a1) ; RV64-NEXT: sb a2, 4(a0) ; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a3, 2(a1) +; RV64-NEXT: lbu a4, 1(a1) ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: sb a3, 2(a0) +; RV64-NEXT: sb a4, 1(a0) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; @@ -606,24 +606,24 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-NEXT: lw a2, 16(a1) ; RV32-FAST-NEXT: sw a2, 16(a0) ; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a3, 8(a1) +; RV32-FAST-NEXT: lw a4, 4(a1) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 23(a1) -; RV64-FAST-NEXT: sd a2, 23(a0) -; RV64-FAST-NEXT: ld a2, 16(a1) -; RV64-FAST-NEXT: sd a2, 16(a0) -; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a3, 16(a1) +; RV64-FAST-NEXT: ld a4, 8(a1) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: sd a3, 16(a0) +; RV64-FAST-NEXT: sd a4, 8(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -743,16 +743,16 @@ define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-LABEL: aligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 3(a1) -; RV32-FAST-NEXT: sw a2, 3(a0) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: aligned_memcpy7: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: lw a2, 3(a1) -; RV64-FAST-NEXT: sw a2, 3(a0) ; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) ; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -764,8 +764,8 @@ define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy8: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 4(a1) -; RV32-BOTH-NEXT: sw a2, 4(a0) ; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; @@ -787,10 +787,10 @@ define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: lh a2, 12(a1) ; RV32-NEXT: sh a2, 12(a0) ; RV32-NEXT: lw a2, 8(a1) -; RV32-NEXT: sw a2, 8(a0) -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a3, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: sw a3, 4(a0) ; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; @@ -809,20 +809,20 @@ define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-LABEL: aligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 11(a1) -; RV32-FAST-NEXT: sw a2, 11(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a3, 8(a1) +; RV32-FAST-NEXT: lw a4, 4(a1) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) ; 
RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: aligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 7(a1) -; RV64-FAST-NEXT: sd a2, 7(a0) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -834,20 +834,20 @@ define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy16: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) -; RV32-BOTH-NEXT: sw a2, 12(a0) -; RV32-BOTH-NEXT: lw a2, 8(a1) -; RV32-BOTH-NEXT: sw a2, 8(a0) -; RV32-BOTH-NEXT: lw a2, 4(a1) -; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a3, 8(a1) +; RV32-BOTH-NEXT: lw a4, 4(a1) ; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: sw a3, 8(a0) +; RV32-BOTH-NEXT: sw a4, 4(a0) ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy16: ; RV64-BOTH: # %bb.0: # %entry ; RV64-BOTH-NEXT: ld a2, 8(a1) -; RV64-BOTH-NEXT: sd a2, 8(a0) ; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a2, 8(a0) ; RV64-BOTH-NEXT: sd a1, 0(a0) ; RV64-BOTH-NEXT: ret entry: @@ -869,12 +869,12 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: lw a2, 16(a1) ; RV32-NEXT: sw a2, 16(a0) ; RV32-NEXT: lw a2, 12(a1) -; RV32-NEXT: sw a2, 12(a0) -; RV32-NEXT: lw a2, 8(a1) -; RV32-NEXT: sw a2, 8(a0) -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a3, 8(a1) +; RV32-NEXT: lw a4, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: sw a3, 8(a0) +; RV32-NEXT: sw a4, 4(a0) ; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; @@ -887,10 +887,10 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: lw a2, 24(a1) ; RV64-NEXT: sw a2, 24(a0) ; RV64-NEXT: ld a2, 16(a1) -; RV64-NEXT: sd a2, 16(a0) -; RV64-NEXT: ld a2, 8(a1) -; RV64-NEXT: sd a2, 8(a0) +; RV64-NEXT: ld a3, 8(a1) ; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a2, 16(a0) +; RV64-NEXT: sd a3, 8(a0) ; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret ; @@ -905,24 +905,24 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-NEXT: lw a2, 16(a1) ; RV32-FAST-NEXT: sw a2, 16(a0) ; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a3, 8(a1) +; RV32-FAST-NEXT: lw a4, 4(a1) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: aligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 23(a1) -; RV64-FAST-NEXT: sd a2, 23(a0) -; RV64-FAST-NEXT: ld a2, 16(a1) -; RV64-FAST-NEXT: sd a2, 16(a0) -; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a3, 16(a1) +; RV64-FAST-NEXT: ld a4, 8(a1) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: sd a3, 16(a0) +; RV64-FAST-NEXT: sd a4, 8(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -938,32 +938,32 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { ; RV32-BOTH-LABEL: memcpy16_align4: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) -; RV32-BOTH-NEXT: sw a2, 12(a0) -; RV32-BOTH-NEXT: lw a2, 8(a1) -; RV32-BOTH-NEXT: sw a2, 8(a0) -; RV32-BOTH-NEXT: lw a2, 4(a1) -; 
RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a3, 8(a1) +; RV32-BOTH-NEXT: lw a4, 4(a1) ; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: sw a3, 8(a0) +; RV32-BOTH-NEXT: sw a4, 4(a0) ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; ; RV64-LABEL: memcpy16_align4: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) -; RV64-NEXT: sw a2, 12(a0) -; RV64-NEXT: lw a2, 8(a1) -; RV64-NEXT: sw a2, 8(a0) -; RV64-NEXT: lw a2, 4(a1) -; RV64-NEXT: sw a2, 4(a0) +; RV64-NEXT: lw a3, 8(a1) +; RV64-NEXT: lw a4, 4(a1) ; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a2, 12(a0) +; RV64-NEXT: sw a3, 8(a0) +; RV64-NEXT: sw a4, 4(a0) ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; ; RV64-FAST-LABEL: memcpy16_align4: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -979,8 +979,8 @@ define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { ; RV32-NEXT: lh a2, 8(a1) ; RV32-NEXT: sh a2, 8(a0) ; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a2, 4(a0) ; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -999,10 +999,10 @@ define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { ; RV32-FAST-LABEL: memcpy11_align8: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 7(a1) -; RV32-FAST-NEXT: sw a2, 7(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a3, 4(a1) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 7(a0) +; RV32-FAST-NEXT: sw a3, 4(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: li a0, 0 ; RV32-FAST-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 26ad872509194..9da5e2b00b215 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -25,16 +25,16 @@ define i32 @t0() { ; RV32: # %bb.0: # %entry ; RV32-NEXT: lui a0, %hi(src) ; RV32-NEXT: lw a1, %lo(src)(a0) -; RV32-NEXT: lui a2, %hi(dst) -; RV32-NEXT: sw a1, %lo(dst)(a2) ; RV32-NEXT: addi a0, a0, %lo(src) -; RV32-NEXT: lbu a1, 10(a0) +; RV32-NEXT: lw a2, 4(a0) ; RV32-NEXT: lh a3, 8(a0) -; RV32-NEXT: lw a0, 4(a0) -; RV32-NEXT: addi a2, a2, %lo(dst) -; RV32-NEXT: sb a1, 10(a2) -; RV32-NEXT: sh a3, 8(a2) -; RV32-NEXT: sw a0, 4(a2) +; RV32-NEXT: lbu a0, 10(a0) +; RV32-NEXT: lui a4, %hi(dst) +; RV32-NEXT: sw a1, %lo(dst)(a4) +; RV32-NEXT: addi a1, a4, %lo(dst) +; RV32-NEXT: sb a0, 10(a1) +; RV32-NEXT: sh a3, 8(a1) +; RV32-NEXT: sw a2, 4(a1) ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret ; @@ -42,14 +42,14 @@ define i32 @t0() { ; RV64: # %bb.0: # %entry ; RV64-NEXT: lui a0, %hi(src) ; RV64-NEXT: ld a1, %lo(src)(a0) -; RV64-NEXT: lui a2, %hi(dst) ; RV64-NEXT: addi a0, a0, %lo(src) -; RV64-NEXT: lbu a3, 10(a0) -; RV64-NEXT: lh a0, 8(a0) -; RV64-NEXT: sd a1, %lo(dst)(a2) -; RV64-NEXT: addi a1, a2, %lo(dst) -; RV64-NEXT: sb a3, 10(a1) -; RV64-NEXT: sh a0, 8(a1) +; RV64-NEXT: lh a2, 8(a0) +; RV64-NEXT: lbu a0, 10(a0) +; RV64-NEXT: lui a3, %hi(dst) +; RV64-NEXT: sd a1, %lo(dst)(a3) +; RV64-NEXT: addi a1, a3, %lo(dst) +; RV64-NEXT: sb a0, 10(a1) +; RV64-NEXT: sh a2, 8(a1) ; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret ; @@ -57,14 +57,14 @@ define i32 @t0() { ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lui a0, %hi(src) ; RV32-FAST-NEXT: lw a1, %lo(src)(a0) -; RV32-FAST-NEXT: lui a2, %hi(dst) ; RV32-FAST-NEXT: addi a0, a0, %lo(src) -; RV32-FAST-NEXT: lw a3, 7(a0) -; RV32-FAST-NEXT: 
lw a0, 4(a0) -; RV32-FAST-NEXT: sw a1, %lo(dst)(a2) -; RV32-FAST-NEXT: addi a1, a2, %lo(dst) -; RV32-FAST-NEXT: sw a3, 7(a1) -; RV32-FAST-NEXT: sw a0, 4(a1) +; RV32-FAST-NEXT: lw a2, 4(a0) +; RV32-FAST-NEXT: lw a0, 7(a0) +; RV32-FAST-NEXT: lui a3, %hi(dst) +; RV32-FAST-NEXT: sw a1, %lo(dst)(a3) +; RV32-FAST-NEXT: addi a1, a3, %lo(dst) +; RV32-FAST-NEXT: sw a0, 7(a1) +; RV32-FAST-NEXT: sw a2, 4(a1) ; RV32-FAST-NEXT: li a0, 0 ; RV32-FAST-NEXT: ret ; @@ -166,16 +166,16 @@ define void @t2(ptr nocapture %C) nounwind { ; RV64-FAST-NEXT: lui a1, %hi(.L.str2) ; RV64-FAST-NEXT: ld a2, %lo(.L.str2)(a1) ; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: lui a2, 1156 -; RV64-FAST-NEXT: addi a2, a2, 332 ; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-FAST-NEXT: ld a3, 24(a1) -; RV64-FAST-NEXT: ld a4, 16(a1) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: sw a2, 32(a0) -; RV64-FAST-NEXT: sd a3, 24(a0) -; RV64-FAST-NEXT: sd a4, 16(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: ld a3, 16(a1) +; RV64-FAST-NEXT: ld a1, 24(a1) +; RV64-FAST-NEXT: lui a4, 1156 +; RV64-FAST-NEXT: addi a4, a4, 332 +; RV64-FAST-NEXT: sw a4, 32(a0) +; RV64-FAST-NEXT: sd a1, 24(a0) +; RV64-FAST-NEXT: sd a3, 16(a0) +; RV64-FAST-NEXT: sd a2, 8(a0) ; RV64-FAST-NEXT: ret entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) @@ -409,32 +409,32 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV32-BOTH-LABEL: t7: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) -; RV32-BOTH-NEXT: sw a2, 12(a0) -; RV32-BOTH-NEXT: lw a2, 8(a1) -; RV32-BOTH-NEXT: sw a2, 8(a0) -; RV32-BOTH-NEXT: lw a2, 4(a1) -; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a3, 8(a1) +; RV32-BOTH-NEXT: lw a4, 4(a1) ; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: sw a3, 8(a0) +; RV32-BOTH-NEXT: sw a4, 4(a0) ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; ; RV64-LABEL: t7: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) -; RV64-NEXT: sw a2, 12(a0) -; RV64-NEXT: lw a2, 8(a1) -; RV64-NEXT: sw a2, 8(a0) -; RV64-NEXT: lw a2, 4(a1) -; RV64-NEXT: sw a2, 4(a0) +; RV64-NEXT: lw a3, 8(a1) +; RV64-NEXT: lw a4, 4(a1) ; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a2, 12(a0) +; RV64-NEXT: sw a3, 8(a0) +; RV64-NEXT: sw a4, 4(a0) ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; ; RV64-FAST-LABEL: t7: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll index 4eb969a357a9e..069d38460bb19 100644 --- a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll +++ b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll @@ -1,32 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; REQUIRES: asserts ; RUN: llc -mtriple=riscv32 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \ -; RUN: | FileCheck -check-prefix=NOCLUSTER %s -; RUN: llc -mtriple=riscv64 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \ -; RUN: | FileCheck -check-prefix=NOCLUSTER %s -; RUN: llc -mtriple=riscv32 -riscv-misched-load-clustering -verify-misched \ -; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \ ; RUN: | FileCheck -check-prefix=LDCLUSTER %s -; RUN: llc -mtriple=riscv64 -riscv-misched-load-clustering -verify-misched \ 
-; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \ +; RUN: llc -mtriple=riscv64 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \ ; RUN: | FileCheck -check-prefix=LDCLUSTER %s - define i32 @load_clustering_1(ptr nocapture %p) { -; NOCLUSTER: ********** MI Scheduling ********** -; NOCLUSTER-LABEL: load_clustering_1:%bb.0 -; NOCLUSTER: *** Final schedule for %bb.0 *** -; NOCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12 -; NOCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8 -; NOCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4 -; NOCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16 -; -; LDCLUSTER: ********** MI Scheduling ********** -; LDCLUSTER-LABEL: load_clustering_1:%bb.0 -; LDCLUSTER: *** Final schedule for %bb.0 *** -; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16 -; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12 -; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8 -; LDCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4 entry: %arrayidx0 = getelementptr inbounds i32, ptr %p, i32 3 %val0 = load i32, i32* %arrayidx0 @@ -41,3 +20,5 @@ entry: %tmp2 = add i32 %tmp1, %val3 ret i32 %tmp2 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; LDCLUSTER: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index f2b7e8d26328d..dd6131e064cac 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1167,48 +1167,48 @@ define i128 @muli128_m3840(i128 %a) nounwind { ; RV32IM-NEXT: addi sp, sp, -16 ; RV32IM-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s1, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw a2, 12(a1) -; RV32IM-NEXT: lw a3, 8(a1) -; RV32IM-NEXT: lw a4, 0(a1) -; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: lw a2, 0(a1) +; RV32IM-NEXT: lw a3, 4(a1) +; RV32IM-NEXT: lw a4, 8(a1) +; RV32IM-NEXT: lw a1, 12(a1) ; RV32IM-NEXT: li a5, -15 ; RV32IM-NEXT: slli a5, a5, 8 -; RV32IM-NEXT: mulhu a6, a4, a5 -; RV32IM-NEXT: mul a7, a1, a5 +; RV32IM-NEXT: mulhu a6, a2, a5 +; RV32IM-NEXT: mul a7, a3, a5 ; RV32IM-NEXT: add a6, a7, a6 ; RV32IM-NEXT: sltu a7, a6, a7 -; RV32IM-NEXT: mulhu t0, a1, a5 +; RV32IM-NEXT: mulhu t0, a3, a5 ; RV32IM-NEXT: add a7, t0, a7 -; RV32IM-NEXT: sub a6, a6, a4 -; RV32IM-NEXT: neg t0, a4 +; RV32IM-NEXT: sub a6, a6, a2 +; RV32IM-NEXT: neg t0, a2 ; RV32IM-NEXT: sltu t1, a6, t0 ; RV32IM-NEXT: li t2, -1 -; RV32IM-NEXT: mulhu t3, a4, t2 +; RV32IM-NEXT: mulhu t3, a2, t2 ; RV32IM-NEXT: add t1, t3, t1 ; RV32IM-NEXT: add t1, a7, t1 -; RV32IM-NEXT: sub t4, t1, a1 -; RV32IM-NEXT: mul t5, a3, a5 -; RV32IM-NEXT: sub t5, t5, a4 +; RV32IM-NEXT: sub t4, t1, a3 +; RV32IM-NEXT: mul t5, a4, a5 +; RV32IM-NEXT: sub t5, t5, a2 ; RV32IM-NEXT: add t6, t4, t5 ; RV32IM-NEXT: sltu s0, t6, t4 -; RV32IM-NEXT: neg s1, a1 +; RV32IM-NEXT: neg s1, a3 ; RV32IM-NEXT: sltu t4, t4, s1 ; RV32IM-NEXT: sltu a7, t1, a7 -; RV32IM-NEXT: mulhu t1, a1, t2 +; RV32IM-NEXT: mulhu t1, a3, t2 ; RV32IM-NEXT: add a7, t1, a7 ; RV32IM-NEXT: add a7, a7, t4 ; RV32IM-NEXT: sltu t0, t5, t0 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: mulhu t1, a3, a5 -; RV32IM-NEXT: sub a3, t1, a3 -; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: mul a1, a1, a5 +; RV32IM-NEXT: mulhu t1, a4, a5 +; RV32IM-NEXT: sub a4, t1, a4 ; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: sub a1, t3, a1 -; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: add a3, a2, a3 +; RV32IM-NEXT: sub a3, t3, a3 +; RV32IM-NEXT: add a1, a3, a1 ; RV32IM-NEXT: add a1, a1, t0 ; RV32IM-NEXT: add a1, a7, a1 ; RV32IM-NEXT: add a1, a1, s0 -; RV32IM-NEXT: mul a2, a4, a5 +; RV32IM-NEXT: mul a2, a2, a5 ; RV32IM-NEXT: sw a2, 0(a0) ; 
RV32IM-NEXT: sw a6, 4(a0) ; RV32IM-NEXT: sw t6, 8(a0) @@ -1252,39 +1252,39 @@ define i128 @muli128_m63(i128 %a) nounwind { ; RV32I-LABEL: muli128_m63: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: lw a3, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: slli a3, a2, 6 -; RV32I-NEXT: sltu a5, a2, a3 +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: slli a1, a2, 6 +; RV32I-NEXT: sltu a4, a2, a1 ; RV32I-NEXT: srli a7, a2, 26 -; RV32I-NEXT: slli t0, a1, 6 +; RV32I-NEXT: slli t0, a3, 6 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: mv t0, a5 -; RV32I-NEXT: beq a1, a7, .LBB31_2 +; RV32I-NEXT: mv t0, a4 +; RV32I-NEXT: beq a3, a7, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t0, a1, a7 +; RV32I-NEXT: sltu t0, a3, a7 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: srli t1, a1, 26 +; RV32I-NEXT: srli t1, a3, 26 ; RV32I-NEXT: slli t2, a6, 6 ; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: sub t2, a6, t1 ; RV32I-NEXT: sltu t3, t2, t0 ; RV32I-NEXT: sltu t1, a6, t1 ; RV32I-NEXT: srli a6, a6, 26 -; RV32I-NEXT: slli t4, a4, 6 +; RV32I-NEXT: slli t4, a5, 6 ; RV32I-NEXT: or a6, t4, a6 -; RV32I-NEXT: sub a4, a4, a6 -; RV32I-NEXT: sub a4, a4, t1 -; RV32I-NEXT: sub a4, a4, t3 +; RV32I-NEXT: sub a5, a5, a6 +; RV32I-NEXT: sub a5, a5, t1 +; RV32I-NEXT: sub a5, a5, t3 ; RV32I-NEXT: sub a6, t2, t0 -; RV32I-NEXT: sub a1, a1, a7 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: sub a3, a3, a7 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sub a2, a2, a1 ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a3, 4(a0) ; RV32I-NEXT: sw a6, 8(a0) -; RV32I-NEXT: sw a4, 12(a0) +; RV32I-NEXT: sw a5, 12(a0) ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli128_m63: @@ -1292,52 +1292,52 @@ define i128 @muli128_m63(i128 %a) nounwind { ; RV32IM-NEXT: addi sp, sp, -16 ; RV32IM-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s1, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw a2, 12(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a4, 4(a1) -; RV32IM-NEXT: lw a1, 8(a1) +; RV32IM-NEXT: lw a2, 0(a1) +; RV32IM-NEXT: lw a3, 4(a1) +; RV32IM-NEXT: lw a4, 8(a1) +; RV32IM-NEXT: lw a1, 12(a1) ; RV32IM-NEXT: li a5, -63 -; RV32IM-NEXT: mulhu a6, a3, a5 -; RV32IM-NEXT: slli a7, a4, 6 -; RV32IM-NEXT: sub a7, a4, a7 +; RV32IM-NEXT: mulhu a6, a2, a5 +; RV32IM-NEXT: slli a7, a3, 6 +; RV32IM-NEXT: sub a7, a3, a7 ; RV32IM-NEXT: add a6, a7, a6 ; RV32IM-NEXT: sltu a7, a6, a7 -; RV32IM-NEXT: mulhu t0, a4, a5 +; RV32IM-NEXT: mulhu t0, a3, a5 ; RV32IM-NEXT: add a7, t0, a7 -; RV32IM-NEXT: sub a6, a6, a3 -; RV32IM-NEXT: neg t0, a3 +; RV32IM-NEXT: sub a6, a6, a2 +; RV32IM-NEXT: neg t0, a2 ; RV32IM-NEXT: sltu t1, a6, t0 ; RV32IM-NEXT: li t2, -1 -; RV32IM-NEXT: mulhu t3, a3, t2 +; RV32IM-NEXT: mulhu t3, a2, t2 ; RV32IM-NEXT: add t1, t3, t1 ; RV32IM-NEXT: add t1, a7, t1 -; RV32IM-NEXT: sub t4, t1, a4 -; RV32IM-NEXT: slli t5, a1, 6 -; RV32IM-NEXT: sub t6, a1, a3 +; RV32IM-NEXT: sub t4, t1, a3 +; RV32IM-NEXT: slli t5, a4, 6 +; RV32IM-NEXT: sub t6, a4, a2 ; RV32IM-NEXT: sub t5, t6, t5 ; RV32IM-NEXT: add t6, t4, t5 ; RV32IM-NEXT: sltu s0, t6, t4 -; RV32IM-NEXT: neg s1, a4 +; RV32IM-NEXT: neg s1, a3 ; RV32IM-NEXT: sltu t4, t4, s1 ; RV32IM-NEXT: sltu a7, t1, a7 -; RV32IM-NEXT: mulhu t1, a4, t2 +; RV32IM-NEXT: mulhu t1, a3, t2 ; RV32IM-NEXT: add a7, t1, a7 ; RV32IM-NEXT: add a7, a7, t4 ; RV32IM-NEXT: sltu t0, t5, t0 -; RV32IM-NEXT: slli t1, a2, 6 -; RV32IM-NEXT: sub a2, a2, t1 -; RV32IM-NEXT: mulhu a5, a1, a5 -; RV32IM-NEXT: sub a5, a5, a1 -; RV32IM-NEXT: add a2, a5, a2 
-; RV32IM-NEXT: add a4, a3, a4 -; RV32IM-NEXT: sub a1, t3, a4 -; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: slli t1, a1, 6 +; RV32IM-NEXT: sub a1, a1, t1 +; RV32IM-NEXT: mulhu a5, a4, a5 +; RV32IM-NEXT: sub a5, a5, a4 +; RV32IM-NEXT: add a1, a5, a1 +; RV32IM-NEXT: add a3, a2, a3 +; RV32IM-NEXT: sub a3, t3, a3 +; RV32IM-NEXT: add a1, a3, a1 ; RV32IM-NEXT: add a1, a1, t0 ; RV32IM-NEXT: add a1, a7, a1 ; RV32IM-NEXT: add a1, a1, s0 -; RV32IM-NEXT: slli a2, a3, 6 -; RV32IM-NEXT: sub a3, a3, a2 -; RV32IM-NEXT: sw a3, 0(a0) +; RV32IM-NEXT: slli a3, a2, 6 +; RV32IM-NEXT: sub a2, a2, a3 +; RV32IM-NEXT: sw a2, 0(a0) ; RV32IM-NEXT: sw a6, 4(a0) ; RV32IM-NEXT: sw t6, 8(a0) ; RV32IM-NEXT: sw a1, 12(a0) diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll index 4c5c36fc72d14..f2421970340b5 100644 --- a/llvm/test/CodeGen/RISCV/nontemporal.ll +++ b/llvm/test/CodeGen/RISCV/nontemporal.ll @@ -907,54 +907,54 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64-NEXT: .cfi_offset s0, -8 ; CHECK-RV64-NEXT: .cfi_offset s1, -16 -; CHECK-RV64-NEXT: lbu a2, 0(a1) -; CHECK-RV64-NEXT: lbu a3, 8(a1) -; CHECK-RV64-NEXT: lbu a4, 16(a1) -; CHECK-RV64-NEXT: lbu a5, 24(a1) -; CHECK-RV64-NEXT: lbu a6, 32(a1) -; CHECK-RV64-NEXT: lbu a7, 40(a1) -; CHECK-RV64-NEXT: lbu t0, 48(a1) -; CHECK-RV64-NEXT: lbu t1, 56(a1) -; CHECK-RV64-NEXT: lbu t2, 64(a1) -; CHECK-RV64-NEXT: lbu t3, 72(a1) -; CHECK-RV64-NEXT: lbu t4, 80(a1) -; CHECK-RV64-NEXT: lbu t5, 88(a1) -; CHECK-RV64-NEXT: lbu t6, 120(a1) -; CHECK-RV64-NEXT: lbu s0, 112(a1) -; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a2, 104(a1) +; CHECK-RV64-NEXT: lbu a3, 112(a1) +; CHECK-RV64-NEXT: lbu a4, 120(a1) +; CHECK-RV64-NEXT: lbu a5, 0(a1) +; CHECK-RV64-NEXT: lbu a6, 8(a1) +; CHECK-RV64-NEXT: lbu a7, 16(a1) +; CHECK-RV64-NEXT: lbu t0, 24(a1) +; CHECK-RV64-NEXT: lbu t1, 32(a1) +; CHECK-RV64-NEXT: lbu t2, 40(a1) +; CHECK-RV64-NEXT: lbu t3, 48(a1) +; CHECK-RV64-NEXT: lbu t4, 56(a1) +; CHECK-RV64-NEXT: lbu t5, 64(a1) +; CHECK-RV64-NEXT: lbu t6, 72(a1) +; CHECK-RV64-NEXT: lbu s0, 80(a1) +; CHECK-RV64-NEXT: lbu s1, 88(a1) ; CHECK-RV64-NEXT: lbu a1, 96(a1) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: sb a4, 15(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: sb a3, 14(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: sb a2, 13(a0) ; CHECK-RV64-NEXT: ntl.all ; CHECK-RV64-NEXT: sb a1, 12(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: sb s1, 11(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: sb s0, 10(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: sb t6, 9(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: sb t5, 8(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: sb t4, 7(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: sb t3, 6(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: sb t2, 5(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: sb t1, 4(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: sb t0, 3(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: sb a7, 2(a0) ; CHECK-RV64-NEXT: ntl.all -; 
CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: sb a6, 1(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: sb a5, 0(a0) ; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -968,54 +968,54 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: .cfi_offset s0, -4 ; CHECK-RV32-NEXT: .cfi_offset s1, -8 -; CHECK-RV32-NEXT: lbu a2, 0(a1) -; CHECK-RV32-NEXT: lbu a3, 4(a1) -; CHECK-RV32-NEXT: lbu a4, 8(a1) -; CHECK-RV32-NEXT: lbu a5, 12(a1) -; CHECK-RV32-NEXT: lbu a6, 16(a1) -; CHECK-RV32-NEXT: lbu a7, 20(a1) -; CHECK-RV32-NEXT: lbu t0, 24(a1) -; CHECK-RV32-NEXT: lbu t1, 28(a1) -; CHECK-RV32-NEXT: lbu t2, 32(a1) -; CHECK-RV32-NEXT: lbu t3, 36(a1) -; CHECK-RV32-NEXT: lbu t4, 40(a1) -; CHECK-RV32-NEXT: lbu t5, 44(a1) -; CHECK-RV32-NEXT: lbu t6, 60(a1) -; CHECK-RV32-NEXT: lbu s0, 56(a1) -; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a2, 52(a1) +; CHECK-RV32-NEXT: lbu a3, 56(a1) +; CHECK-RV32-NEXT: lbu a4, 60(a1) +; CHECK-RV32-NEXT: lbu a5, 0(a1) +; CHECK-RV32-NEXT: lbu a6, 4(a1) +; CHECK-RV32-NEXT: lbu a7, 8(a1) +; CHECK-RV32-NEXT: lbu t0, 12(a1) +; CHECK-RV32-NEXT: lbu t1, 16(a1) +; CHECK-RV32-NEXT: lbu t2, 20(a1) +; CHECK-RV32-NEXT: lbu t3, 24(a1) +; CHECK-RV32-NEXT: lbu t4, 28(a1) +; CHECK-RV32-NEXT: lbu t5, 32(a1) +; CHECK-RV32-NEXT: lbu t6, 36(a1) +; CHECK-RV32-NEXT: lbu s0, 40(a1) +; CHECK-RV32-NEXT: lbu s1, 44(a1) ; CHECK-RV32-NEXT: lbu a1, 48(a1) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: sb a4, 15(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: sb a3, 14(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: sb a2, 13(a0) ; CHECK-RV32-NEXT: ntl.all ; CHECK-RV32-NEXT: sb a1, 12(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: sb s1, 11(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: sb s0, 10(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: sb t6, 9(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: sb t5, 8(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: sb t4, 7(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: sb t3, 6(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: sb t2, 5(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: sb t1, 4(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: sb t0, 3(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: sb a7, 2(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: sb a6, 1(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: sb a5, 0(a0) ; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -1029,44 +1029,44 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64C-NEXT: .cfi_offset s0, -8 ; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu t3, 104(a1) +; CHECK-RV64C-NEXT: lbu t6, 112(a1) +; CHECK-RV64C-NEXT: lbu a4, 120(a1) ; 
CHECK-RV64C-NEXT: lbu a6, 0(a1) ; CHECK-RV64C-NEXT: lbu a7, 8(a1) ; CHECK-RV64C-NEXT: lbu t0, 16(a1) ; CHECK-RV64C-NEXT: lbu t1, 24(a1) ; CHECK-RV64C-NEXT: lbu t2, 32(a1) -; CHECK-RV64C-NEXT: lbu t3, 40(a1) -; CHECK-RV64C-NEXT: lbu t4, 48(a1) -; CHECK-RV64C-NEXT: lbu t5, 56(a1) -; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu t4, 40(a1) +; CHECK-RV64C-NEXT: lbu t5, 48(a1) +; CHECK-RV64C-NEXT: lbu a5, 56(a1) +; CHECK-RV64C-NEXT: lbu a2, 64(a1) ; CHECK-RV64C-NEXT: lbu a3, 72(a1) -; CHECK-RV64C-NEXT: lbu a4, 80(a1) -; CHECK-RV64C-NEXT: lbu a5, 88(a1) -; CHECK-RV64C-NEXT: lbu a2, 120(a1) -; CHECK-RV64C-NEXT: lbu s0, 112(a1) -; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu s0, 80(a1) +; CHECK-RV64C-NEXT: lbu s1, 88(a1) ; CHECK-RV64C-NEXT: lbu a1, 96(a1) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: sb a4, 15(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: sb t6, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: sb t3, 13(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sb a1, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: sb s1, 11(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: sb s0, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sb a3, 9(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: sb a2, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: sb a5, 7(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: sb t5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: sb t4, 5(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sb t2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.all @@ -1090,44 +1090,44 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32C-NEXT: .cfi_offset s0, -4 ; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu t3, 52(a1) +; CHECK-RV32C-NEXT: lbu t6, 56(a1) +; CHECK-RV32C-NEXT: lbu a4, 60(a1) ; CHECK-RV32C-NEXT: lbu a6, 0(a1) ; CHECK-RV32C-NEXT: lbu a7, 4(a1) ; CHECK-RV32C-NEXT: lbu t0, 8(a1) ; CHECK-RV32C-NEXT: lbu t1, 12(a1) ; CHECK-RV32C-NEXT: lbu t2, 16(a1) -; CHECK-RV32C-NEXT: lbu t3, 20(a1) -; CHECK-RV32C-NEXT: lbu t4, 24(a1) -; CHECK-RV32C-NEXT: lbu t5, 28(a1) -; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu t4, 20(a1) +; CHECK-RV32C-NEXT: lbu t5, 24(a1) +; CHECK-RV32C-NEXT: lbu a5, 28(a1) +; CHECK-RV32C-NEXT: lbu a2, 32(a1) ; CHECK-RV32C-NEXT: lbu a3, 36(a1) -; CHECK-RV32C-NEXT: lbu a4, 40(a1) -; CHECK-RV32C-NEXT: lbu a5, 44(a1) -; CHECK-RV32C-NEXT: lbu a2, 60(a1) -; CHECK-RV32C-NEXT: lbu s0, 56(a1) -; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu s0, 40(a1) +; CHECK-RV32C-NEXT: lbu s1, 44(a1) ; CHECK-RV32C-NEXT: lbu a1, 48(a1) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: sb a4, 15(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: sb t6, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: sb t3, 13(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sb a1, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: sb s1, 11(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb a4, 10(a0) +; 
CHECK-RV32C-NEXT: sb s0, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sb a3, 9(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: sb a2, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: sb a5, 7(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: sb t5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: sb t4, 5(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sb t2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.all @@ -1163,112 +1163,112 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) { define void @test_nontemporal_store_v8i16(ptr %p, <8 x i16> %v) { ; CHECK-RV64-LABEL: test_nontemporal_store_v8i16: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: lh a2, 0(a1) -; CHECK-RV64-NEXT: lh a3, 8(a1) -; CHECK-RV64-NEXT: lh a4, 16(a1) -; CHECK-RV64-NEXT: lh a5, 24(a1) -; CHECK-RV64-NEXT: lh a6, 56(a1) -; CHECK-RV64-NEXT: lh a7, 48(a1) -; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a2, 40(a1) +; CHECK-RV64-NEXT: lh a3, 48(a1) +; CHECK-RV64-NEXT: lh a4, 56(a1) +; CHECK-RV64-NEXT: lh a5, 0(a1) +; CHECK-RV64-NEXT: lh a6, 8(a1) +; CHECK-RV64-NEXT: lh a7, 16(a1) +; CHECK-RV64-NEXT: lh t0, 24(a1) ; CHECK-RV64-NEXT: lh a1, 32(a1) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: sh a4, 14(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: sh a3, 12(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: sh a2, 10(a0) ; CHECK-RV64-NEXT: ntl.all ; CHECK-RV64-NEXT: sh a1, 8(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: sh t0, 6(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: sh a7, 4(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: sh a6, 2(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: sh a5, 0(a0) ; CHECK-RV64-NEXT: ret ; ; CHECK-RV32-LABEL: test_nontemporal_store_v8i16: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lh a2, 0(a1) -; CHECK-RV32-NEXT: lh a3, 4(a1) -; CHECK-RV32-NEXT: lh a4, 8(a1) -; CHECK-RV32-NEXT: lh a5, 12(a1) -; CHECK-RV32-NEXT: lh a6, 28(a1) -; CHECK-RV32-NEXT: lh a7, 24(a1) -; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a2, 20(a1) +; CHECK-RV32-NEXT: lh a3, 24(a1) +; CHECK-RV32-NEXT: lh a4, 28(a1) +; CHECK-RV32-NEXT: lh a5, 0(a1) +; CHECK-RV32-NEXT: lh a6, 4(a1) +; CHECK-RV32-NEXT: lh a7, 8(a1) +; CHECK-RV32-NEXT: lh t0, 12(a1) ; CHECK-RV32-NEXT: lh a1, 16(a1) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: sh a4, 14(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: sh a3, 12(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: sh a2, 10(a0) ; CHECK-RV32-NEXT: ntl.all ; CHECK-RV32-NEXT: sh a1, 8(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: sh t0, 6(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: sh a7, 4(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: sh a6, 2(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: sh a5, 0(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64C-LABEL: test_nontemporal_store_v8i16: ; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a7, 40(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 56(a1) 
; CHECK-RV64C-NEXT: lh a6, 0(a1) -; CHECK-RV64C-NEXT: lh a7, 8(a1) -; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh t0, 8(a1) +; CHECK-RV64C-NEXT: lh a2, 16(a1) ; CHECK-RV64C-NEXT: lh a5, 24(a1) -; CHECK-RV64C-NEXT: lh a2, 56(a1) -; CHECK-RV64C-NEXT: lh a3, 48(a1) -; CHECK-RV64C-NEXT: lh a4, 40(a1) ; CHECK-RV64C-NEXT: lh a1, 32(a1) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: sh a4, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sh a3, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: sh a7, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sh a1, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sh a5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: sh a2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: sh t0, 2(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sh a6, 0(a0) ; CHECK-RV64C-NEXT: ret ; ; CHECK-RV32C-LABEL: test_nontemporal_store_v8i16: ; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a7, 20(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 28(a1) ; CHECK-RV32C-NEXT: lh a6, 0(a1) -; CHECK-RV32C-NEXT: lh a7, 4(a1) -; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh t0, 4(a1) +; CHECK-RV32C-NEXT: lh a2, 8(a1) ; CHECK-RV32C-NEXT: lh a5, 12(a1) -; CHECK-RV32C-NEXT: lh a2, 28(a1) -; CHECK-RV32C-NEXT: lh a3, 24(a1) -; CHECK-RV32C-NEXT: lh a4, 20(a1) ; CHECK-RV32C-NEXT: lh a1, 16(a1) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: sh a4, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sh a3, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: sh a7, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sh a1, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sh a5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: sh a2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: sh t0, 2(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sh a6, 0(a0) ; CHECK-RV32C-NEXT: ret @@ -2321,54 +2321,54 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64-NEXT: .cfi_offset s0, -8 ; CHECK-RV64-NEXT: .cfi_offset s1, -16 -; CHECK-RV64-NEXT: lbu a2, 0(a1) -; CHECK-RV64-NEXT: lbu a3, 8(a1) -; CHECK-RV64-NEXT: lbu a4, 16(a1) -; CHECK-RV64-NEXT: lbu a5, 24(a1) -; CHECK-RV64-NEXT: lbu a6, 32(a1) -; CHECK-RV64-NEXT: lbu a7, 40(a1) -; CHECK-RV64-NEXT: lbu t0, 48(a1) -; CHECK-RV64-NEXT: lbu t1, 56(a1) -; CHECK-RV64-NEXT: lbu t2, 64(a1) -; CHECK-RV64-NEXT: lbu t3, 72(a1) -; CHECK-RV64-NEXT: lbu t4, 80(a1) -; CHECK-RV64-NEXT: lbu t5, 88(a1) -; CHECK-RV64-NEXT: lbu t6, 120(a1) -; CHECK-RV64-NEXT: lbu s0, 112(a1) -; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a2, 104(a1) +; CHECK-RV64-NEXT: lbu a3, 112(a1) +; CHECK-RV64-NEXT: lbu a4, 120(a1) +; CHECK-RV64-NEXT: lbu a5, 0(a1) +; CHECK-RV64-NEXT: lbu a6, 8(a1) +; CHECK-RV64-NEXT: lbu a7, 16(a1) +; CHECK-RV64-NEXT: lbu t0, 24(a1) +; CHECK-RV64-NEXT: lbu t1, 32(a1) +; CHECK-RV64-NEXT: lbu t2, 40(a1) +; CHECK-RV64-NEXT: lbu t3, 48(a1) +; CHECK-RV64-NEXT: lbu t4, 56(a1) +; CHECK-RV64-NEXT: lbu t5, 64(a1) +; CHECK-RV64-NEXT: lbu t6, 72(a1) +; CHECK-RV64-NEXT: lbu s0, 80(a1) +; CHECK-RV64-NEXT: lbu s1, 88(a1) ; CHECK-RV64-NEXT: lbu a1, 96(a1) ; CHECK-RV64-NEXT: ntl.p1 -; 
CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: sb a4, 15(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: sb a3, 14(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: sb a2, 13(a0) ; CHECK-RV64-NEXT: ntl.p1 ; CHECK-RV64-NEXT: sb a1, 12(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: sb s1, 11(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: sb s0, 10(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: sb t6, 9(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: sb t5, 8(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: sb t4, 7(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: sb t3, 6(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: sb t2, 5(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: sb t1, 4(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: sb t0, 3(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: sb a7, 2(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: sb a6, 1(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: sb a5, 0(a0) ; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -2382,54 +2382,54 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: .cfi_offset s0, -4 ; CHECK-RV32-NEXT: .cfi_offset s1, -8 -; CHECK-RV32-NEXT: lbu a2, 0(a1) -; CHECK-RV32-NEXT: lbu a3, 4(a1) -; CHECK-RV32-NEXT: lbu a4, 8(a1) -; CHECK-RV32-NEXT: lbu a5, 12(a1) -; CHECK-RV32-NEXT: lbu a6, 16(a1) -; CHECK-RV32-NEXT: lbu a7, 20(a1) -; CHECK-RV32-NEXT: lbu t0, 24(a1) -; CHECK-RV32-NEXT: lbu t1, 28(a1) -; CHECK-RV32-NEXT: lbu t2, 32(a1) -; CHECK-RV32-NEXT: lbu t3, 36(a1) -; CHECK-RV32-NEXT: lbu t4, 40(a1) -; CHECK-RV32-NEXT: lbu t5, 44(a1) -; CHECK-RV32-NEXT: lbu t6, 60(a1) -; CHECK-RV32-NEXT: lbu s0, 56(a1) -; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a2, 52(a1) +; CHECK-RV32-NEXT: lbu a3, 56(a1) +; CHECK-RV32-NEXT: lbu a4, 60(a1) +; CHECK-RV32-NEXT: lbu a5, 0(a1) +; CHECK-RV32-NEXT: lbu a6, 4(a1) +; CHECK-RV32-NEXT: lbu a7, 8(a1) +; CHECK-RV32-NEXT: lbu t0, 12(a1) +; CHECK-RV32-NEXT: lbu t1, 16(a1) +; CHECK-RV32-NEXT: lbu t2, 20(a1) +; CHECK-RV32-NEXT: lbu t3, 24(a1) +; CHECK-RV32-NEXT: lbu t4, 28(a1) +; CHECK-RV32-NEXT: lbu t5, 32(a1) +; CHECK-RV32-NEXT: lbu t6, 36(a1) +; CHECK-RV32-NEXT: lbu s0, 40(a1) +; CHECK-RV32-NEXT: lbu s1, 44(a1) ; CHECK-RV32-NEXT: lbu a1, 48(a1) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: sb a4, 15(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: sb a3, 14(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: sb a2, 13(a0) ; CHECK-RV32-NEXT: ntl.p1 ; CHECK-RV32-NEXT: sb a1, 12(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: sb s1, 11(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: sb s0, 10(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: sb t6, 9(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb t2, 8(a0) +; 
CHECK-RV32-NEXT: sb t5, 8(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: sb t4, 7(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: sb t3, 6(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: sb t2, 5(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: sb t1, 4(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: sb t0, 3(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: sb a7, 2(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: sb a6, 1(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: sb a5, 0(a0) ; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -2443,44 +2443,44 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64C-NEXT: .cfi_offset s0, -8 ; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu t3, 104(a1) +; CHECK-RV64C-NEXT: lbu t6, 112(a1) +; CHECK-RV64C-NEXT: lbu a4, 120(a1) ; CHECK-RV64C-NEXT: lbu a6, 0(a1) ; CHECK-RV64C-NEXT: lbu a7, 8(a1) ; CHECK-RV64C-NEXT: lbu t0, 16(a1) ; CHECK-RV64C-NEXT: lbu t1, 24(a1) ; CHECK-RV64C-NEXT: lbu t2, 32(a1) -; CHECK-RV64C-NEXT: lbu t3, 40(a1) -; CHECK-RV64C-NEXT: lbu t4, 48(a1) -; CHECK-RV64C-NEXT: lbu t5, 56(a1) -; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu t4, 40(a1) +; CHECK-RV64C-NEXT: lbu t5, 48(a1) +; CHECK-RV64C-NEXT: lbu a5, 56(a1) +; CHECK-RV64C-NEXT: lbu a2, 64(a1) ; CHECK-RV64C-NEXT: lbu a3, 72(a1) -; CHECK-RV64C-NEXT: lbu a4, 80(a1) -; CHECK-RV64C-NEXT: lbu a5, 88(a1) -; CHECK-RV64C-NEXT: lbu a2, 120(a1) -; CHECK-RV64C-NEXT: lbu s0, 112(a1) -; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu s0, 80(a1) +; CHECK-RV64C-NEXT: lbu s1, 88(a1) ; CHECK-RV64C-NEXT: lbu a1, 96(a1) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: sb a4, 15(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: sb t6, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: sb t3, 13(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 ; CHECK-RV64C-NEXT: sb a1, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: sb s1, 11(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: sb s0, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 ; CHECK-RV64C-NEXT: sb a3, 9(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: sb a2, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: sb a5, 7(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: sb t5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: sb t4, 5(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 ; CHECK-RV64C-NEXT: sb t2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 @@ -2504,44 +2504,44 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32C-NEXT: .cfi_offset s0, -4 ; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu t3, 52(a1) +; CHECK-RV32C-NEXT: lbu t6, 56(a1) +; CHECK-RV32C-NEXT: lbu a4, 60(a1) ; CHECK-RV32C-NEXT: lbu a6, 0(a1) ; CHECK-RV32C-NEXT: lbu a7, 4(a1) ; 
CHECK-RV32C-NEXT: lbu t0, 8(a1) ; CHECK-RV32C-NEXT: lbu t1, 12(a1) ; CHECK-RV32C-NEXT: lbu t2, 16(a1) -; CHECK-RV32C-NEXT: lbu t3, 20(a1) -; CHECK-RV32C-NEXT: lbu t4, 24(a1) -; CHECK-RV32C-NEXT: lbu t5, 28(a1) -; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu t4, 20(a1) +; CHECK-RV32C-NEXT: lbu t5, 24(a1) +; CHECK-RV32C-NEXT: lbu a5, 28(a1) +; CHECK-RV32C-NEXT: lbu a2, 32(a1) ; CHECK-RV32C-NEXT: lbu a3, 36(a1) -; CHECK-RV32C-NEXT: lbu a4, 40(a1) -; CHECK-RV32C-NEXT: lbu a5, 44(a1) -; CHECK-RV32C-NEXT: lbu a2, 60(a1) -; CHECK-RV32C-NEXT: lbu s0, 56(a1) -; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu s0, 40(a1) +; CHECK-RV32C-NEXT: lbu s1, 44(a1) ; CHECK-RV32C-NEXT: lbu a1, 48(a1) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: sb a4, 15(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: sb t6, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: sb t3, 13(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 ; CHECK-RV32C-NEXT: sb a1, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: sb s1, 11(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: sb s0, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 ; CHECK-RV32C-NEXT: sb a3, 9(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: sb a2, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: sb a5, 7(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: sb t5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: sb t4, 5(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 ; CHECK-RV32C-NEXT: sb t2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 @@ -2577,112 +2577,112 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) { define void @test_nontemporal_P1_store_v8i16(ptr %p, <8 x i16> %v) { ; CHECK-RV64-LABEL: test_nontemporal_P1_store_v8i16: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: lh a2, 0(a1) -; CHECK-RV64-NEXT: lh a3, 8(a1) -; CHECK-RV64-NEXT: lh a4, 16(a1) -; CHECK-RV64-NEXT: lh a5, 24(a1) -; CHECK-RV64-NEXT: lh a6, 56(a1) -; CHECK-RV64-NEXT: lh a7, 48(a1) -; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a2, 40(a1) +; CHECK-RV64-NEXT: lh a3, 48(a1) +; CHECK-RV64-NEXT: lh a4, 56(a1) +; CHECK-RV64-NEXT: lh a5, 0(a1) +; CHECK-RV64-NEXT: lh a6, 8(a1) +; CHECK-RV64-NEXT: lh a7, 16(a1) +; CHECK-RV64-NEXT: lh t0, 24(a1) ; CHECK-RV64-NEXT: lh a1, 32(a1) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: sh a4, 14(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: sh a3, 12(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: sh a2, 10(a0) ; CHECK-RV64-NEXT: ntl.p1 ; CHECK-RV64-NEXT: sh a1, 8(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: sh t0, 6(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: sh a7, 4(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: sh a6, 2(a0) ; CHECK-RV64-NEXT: ntl.p1 -; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: sh a5, 0(a0) ; CHECK-RV64-NEXT: ret ; ; CHECK-RV32-LABEL: test_nontemporal_P1_store_v8i16: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lh a2, 0(a1) -; CHECK-RV32-NEXT: lh a3, 4(a1) -; CHECK-RV32-NEXT: lh a4, 8(a1) -; CHECK-RV32-NEXT: lh a5, 12(a1) -; CHECK-RV32-NEXT: lh a6, 28(a1) -; CHECK-RV32-NEXT: lh 
a7, 24(a1) -; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a2, 20(a1) +; CHECK-RV32-NEXT: lh a3, 24(a1) +; CHECK-RV32-NEXT: lh a4, 28(a1) +; CHECK-RV32-NEXT: lh a5, 0(a1) +; CHECK-RV32-NEXT: lh a6, 4(a1) +; CHECK-RV32-NEXT: lh a7, 8(a1) +; CHECK-RV32-NEXT: lh t0, 12(a1) ; CHECK-RV32-NEXT: lh a1, 16(a1) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: sh a4, 14(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: sh a3, 12(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: sh a2, 10(a0) ; CHECK-RV32-NEXT: ntl.p1 ; CHECK-RV32-NEXT: sh a1, 8(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: sh t0, 6(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: sh a7, 4(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: sh a6, 2(a0) ; CHECK-RV32-NEXT: ntl.p1 -; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: sh a5, 0(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v8i16: ; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a7, 40(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 56(a1) ; CHECK-RV64C-NEXT: lh a6, 0(a1) -; CHECK-RV64C-NEXT: lh a7, 8(a1) -; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh t0, 8(a1) +; CHECK-RV64C-NEXT: lh a2, 16(a1) ; CHECK-RV64C-NEXT: lh a5, 24(a1) -; CHECK-RV64C-NEXT: lh a2, 56(a1) -; CHECK-RV64C-NEXT: lh a3, 48(a1) -; CHECK-RV64C-NEXT: lh a4, 40(a1) ; CHECK-RV64C-NEXT: lh a1, 32(a1) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: sh a4, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 ; CHECK-RV64C-NEXT: sh a3, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: sh a7, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 ; CHECK-RV64C-NEXT: sh a1, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 ; CHECK-RV64C-NEXT: sh a5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: sh a2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 -; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: sh t0, 2(a0) ; CHECK-RV64C-NEXT: c.ntl.p1 ; CHECK-RV64C-NEXT: sh a6, 0(a0) ; CHECK-RV64C-NEXT: ret ; ; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v8i16: ; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a7, 20(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 28(a1) ; CHECK-RV32C-NEXT: lh a6, 0(a1) -; CHECK-RV32C-NEXT: lh a7, 4(a1) -; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh t0, 4(a1) +; CHECK-RV32C-NEXT: lh a2, 8(a1) ; CHECK-RV32C-NEXT: lh a5, 12(a1) -; CHECK-RV32C-NEXT: lh a2, 28(a1) -; CHECK-RV32C-NEXT: lh a3, 24(a1) -; CHECK-RV32C-NEXT: lh a4, 20(a1) ; CHECK-RV32C-NEXT: lh a1, 16(a1) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: sh a4, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 ; CHECK-RV32C-NEXT: sh a3, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: sh a7, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 ; CHECK-RV32C-NEXT: sh a1, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 ; CHECK-RV32C-NEXT: sh a5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: sh a2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 -; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: sh t0, 2(a0) ; CHECK-RV32C-NEXT: c.ntl.p1 ; CHECK-RV32C-NEXT: sh a6, 0(a0) ; CHECK-RV32C-NEXT: ret @@ -3735,54 +3735,54 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64-NEXT: sd s1, 
0(sp) # 8-byte Folded Spill ; CHECK-RV64-NEXT: .cfi_offset s0, -8 ; CHECK-RV64-NEXT: .cfi_offset s1, -16 -; CHECK-RV64-NEXT: lbu a2, 0(a1) -; CHECK-RV64-NEXT: lbu a3, 8(a1) -; CHECK-RV64-NEXT: lbu a4, 16(a1) -; CHECK-RV64-NEXT: lbu a5, 24(a1) -; CHECK-RV64-NEXT: lbu a6, 32(a1) -; CHECK-RV64-NEXT: lbu a7, 40(a1) -; CHECK-RV64-NEXT: lbu t0, 48(a1) -; CHECK-RV64-NEXT: lbu t1, 56(a1) -; CHECK-RV64-NEXT: lbu t2, 64(a1) -; CHECK-RV64-NEXT: lbu t3, 72(a1) -; CHECK-RV64-NEXT: lbu t4, 80(a1) -; CHECK-RV64-NEXT: lbu t5, 88(a1) -; CHECK-RV64-NEXT: lbu t6, 120(a1) -; CHECK-RV64-NEXT: lbu s0, 112(a1) -; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a2, 104(a1) +; CHECK-RV64-NEXT: lbu a3, 112(a1) +; CHECK-RV64-NEXT: lbu a4, 120(a1) +; CHECK-RV64-NEXT: lbu a5, 0(a1) +; CHECK-RV64-NEXT: lbu a6, 8(a1) +; CHECK-RV64-NEXT: lbu a7, 16(a1) +; CHECK-RV64-NEXT: lbu t0, 24(a1) +; CHECK-RV64-NEXT: lbu t1, 32(a1) +; CHECK-RV64-NEXT: lbu t2, 40(a1) +; CHECK-RV64-NEXT: lbu t3, 48(a1) +; CHECK-RV64-NEXT: lbu t4, 56(a1) +; CHECK-RV64-NEXT: lbu t5, 64(a1) +; CHECK-RV64-NEXT: lbu t6, 72(a1) +; CHECK-RV64-NEXT: lbu s0, 80(a1) +; CHECK-RV64-NEXT: lbu s1, 88(a1) ; CHECK-RV64-NEXT: lbu a1, 96(a1) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: sb a4, 15(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: sb a3, 14(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: sb a2, 13(a0) ; CHECK-RV64-NEXT: ntl.pall ; CHECK-RV64-NEXT: sb a1, 12(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: sb s1, 11(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: sb s0, 10(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: sb t6, 9(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: sb t5, 8(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: sb t4, 7(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: sb t3, 6(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: sb t2, 5(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: sb t1, 4(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: sb t0, 3(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: sb a7, 2(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: sb a6, 1(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: sb a5, 0(a0) ; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -3796,54 +3796,54 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: .cfi_offset s0, -4 ; CHECK-RV32-NEXT: .cfi_offset s1, -8 -; CHECK-RV32-NEXT: lbu a2, 0(a1) -; CHECK-RV32-NEXT: lbu a3, 4(a1) -; CHECK-RV32-NEXT: lbu a4, 8(a1) -; CHECK-RV32-NEXT: lbu a5, 12(a1) -; CHECK-RV32-NEXT: lbu a6, 16(a1) -; CHECK-RV32-NEXT: lbu a7, 20(a1) -; CHECK-RV32-NEXT: lbu t0, 24(a1) -; CHECK-RV32-NEXT: lbu t1, 28(a1) -; CHECK-RV32-NEXT: lbu t2, 32(a1) -; CHECK-RV32-NEXT: lbu t3, 36(a1) -; CHECK-RV32-NEXT: lbu t4, 40(a1) -; CHECK-RV32-NEXT: lbu t5, 44(a1) -; CHECK-RV32-NEXT: lbu t6, 60(a1) -; CHECK-RV32-NEXT: lbu s0, 56(a1) -; CHECK-RV32-NEXT: lbu 
s1, 52(a1) +; CHECK-RV32-NEXT: lbu a2, 52(a1) +; CHECK-RV32-NEXT: lbu a3, 56(a1) +; CHECK-RV32-NEXT: lbu a4, 60(a1) +; CHECK-RV32-NEXT: lbu a5, 0(a1) +; CHECK-RV32-NEXT: lbu a6, 4(a1) +; CHECK-RV32-NEXT: lbu a7, 8(a1) +; CHECK-RV32-NEXT: lbu t0, 12(a1) +; CHECK-RV32-NEXT: lbu t1, 16(a1) +; CHECK-RV32-NEXT: lbu t2, 20(a1) +; CHECK-RV32-NEXT: lbu t3, 24(a1) +; CHECK-RV32-NEXT: lbu t4, 28(a1) +; CHECK-RV32-NEXT: lbu t5, 32(a1) +; CHECK-RV32-NEXT: lbu t6, 36(a1) +; CHECK-RV32-NEXT: lbu s0, 40(a1) +; CHECK-RV32-NEXT: lbu s1, 44(a1) ; CHECK-RV32-NEXT: lbu a1, 48(a1) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: sb a4, 15(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: sb a3, 14(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: sb a2, 13(a0) ; CHECK-RV32-NEXT: ntl.pall ; CHECK-RV32-NEXT: sb a1, 12(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: sb s1, 11(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: sb s0, 10(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: sb t6, 9(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: sb t5, 8(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: sb t4, 7(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: sb t3, 6(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: sb t2, 5(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: sb t1, 4(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: sb t0, 3(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: sb a7, 2(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: sb a6, 1(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: sb a5, 0(a0) ; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -3857,44 +3857,44 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64C-NEXT: .cfi_offset s0, -8 ; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu t3, 104(a1) +; CHECK-RV64C-NEXT: lbu t6, 112(a1) +; CHECK-RV64C-NEXT: lbu a4, 120(a1) ; CHECK-RV64C-NEXT: lbu a6, 0(a1) ; CHECK-RV64C-NEXT: lbu a7, 8(a1) ; CHECK-RV64C-NEXT: lbu t0, 16(a1) ; CHECK-RV64C-NEXT: lbu t1, 24(a1) ; CHECK-RV64C-NEXT: lbu t2, 32(a1) -; CHECK-RV64C-NEXT: lbu t3, 40(a1) -; CHECK-RV64C-NEXT: lbu t4, 48(a1) -; CHECK-RV64C-NEXT: lbu t5, 56(a1) -; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu t4, 40(a1) +; CHECK-RV64C-NEXT: lbu t5, 48(a1) +; CHECK-RV64C-NEXT: lbu a5, 56(a1) +; CHECK-RV64C-NEXT: lbu a2, 64(a1) ; CHECK-RV64C-NEXT: lbu a3, 72(a1) -; CHECK-RV64C-NEXT: lbu a4, 80(a1) -; CHECK-RV64C-NEXT: lbu a5, 88(a1) -; CHECK-RV64C-NEXT: lbu a2, 120(a1) -; CHECK-RV64C-NEXT: lbu s0, 112(a1) -; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu s0, 80(a1) +; CHECK-RV64C-NEXT: lbu s1, 88(a1) ; CHECK-RV64C-NEXT: lbu a1, 96(a1) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: sb a4, 15(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: sb t6, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.pall 
-; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: sb t3, 13(a0) ; CHECK-RV64C-NEXT: c.ntl.pall ; CHECK-RV64C-NEXT: sb a1, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: sb s1, 11(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: sb s0, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.pall ; CHECK-RV64C-NEXT: sb a3, 9(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: sb a2, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: sb a5, 7(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: sb t5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: sb t4, 5(a0) ; CHECK-RV64C-NEXT: c.ntl.pall ; CHECK-RV64C-NEXT: sb t2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.pall @@ -3918,44 +3918,44 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32C-NEXT: .cfi_offset s0, -4 ; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu t3, 52(a1) +; CHECK-RV32C-NEXT: lbu t6, 56(a1) +; CHECK-RV32C-NEXT: lbu a4, 60(a1) ; CHECK-RV32C-NEXT: lbu a6, 0(a1) ; CHECK-RV32C-NEXT: lbu a7, 4(a1) ; CHECK-RV32C-NEXT: lbu t0, 8(a1) ; CHECK-RV32C-NEXT: lbu t1, 12(a1) ; CHECK-RV32C-NEXT: lbu t2, 16(a1) -; CHECK-RV32C-NEXT: lbu t3, 20(a1) -; CHECK-RV32C-NEXT: lbu t4, 24(a1) -; CHECK-RV32C-NEXT: lbu t5, 28(a1) -; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu t4, 20(a1) +; CHECK-RV32C-NEXT: lbu t5, 24(a1) +; CHECK-RV32C-NEXT: lbu a5, 28(a1) +; CHECK-RV32C-NEXT: lbu a2, 32(a1) ; CHECK-RV32C-NEXT: lbu a3, 36(a1) -; CHECK-RV32C-NEXT: lbu a4, 40(a1) -; CHECK-RV32C-NEXT: lbu a5, 44(a1) -; CHECK-RV32C-NEXT: lbu a2, 60(a1) -; CHECK-RV32C-NEXT: lbu s0, 56(a1) -; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu s0, 40(a1) +; CHECK-RV32C-NEXT: lbu s1, 44(a1) ; CHECK-RV32C-NEXT: lbu a1, 48(a1) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: sb a4, 15(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: sb t6, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: sb t3, 13(a0) ; CHECK-RV32C-NEXT: c.ntl.pall ; CHECK-RV32C-NEXT: sb a1, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: sb s1, 11(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: sb s0, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.pall ; CHECK-RV32C-NEXT: sb a3, 9(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: sb a2, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: sb a5, 7(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: sb t5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: sb t4, 5(a0) ; CHECK-RV32C-NEXT: c.ntl.pall ; CHECK-RV32C-NEXT: sb t2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.pall @@ -3991,112 +3991,112 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) { define void @test_nontemporal_PALL_store_v8i16(ptr %p, <8 x i16> %v) { ; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v8i16: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: lh a2, 0(a1) -; CHECK-RV64-NEXT: lh a3, 8(a1) -; CHECK-RV64-NEXT: lh a4, 16(a1) -; CHECK-RV64-NEXT: lh a5, 24(a1) -; CHECK-RV64-NEXT: lh a6, 56(a1) -; 
CHECK-RV64-NEXT: lh a7, 48(a1) -; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a2, 40(a1) +; CHECK-RV64-NEXT: lh a3, 48(a1) +; CHECK-RV64-NEXT: lh a4, 56(a1) +; CHECK-RV64-NEXT: lh a5, 0(a1) +; CHECK-RV64-NEXT: lh a6, 8(a1) +; CHECK-RV64-NEXT: lh a7, 16(a1) +; CHECK-RV64-NEXT: lh t0, 24(a1) ; CHECK-RV64-NEXT: lh a1, 32(a1) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: sh a4, 14(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: sh a3, 12(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: sh a2, 10(a0) ; CHECK-RV64-NEXT: ntl.pall ; CHECK-RV64-NEXT: sh a1, 8(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: sh t0, 6(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: sh a7, 4(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: sh a6, 2(a0) ; CHECK-RV64-NEXT: ntl.pall -; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: sh a5, 0(a0) ; CHECK-RV64-NEXT: ret ; ; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v8i16: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lh a2, 0(a1) -; CHECK-RV32-NEXT: lh a3, 4(a1) -; CHECK-RV32-NEXT: lh a4, 8(a1) -; CHECK-RV32-NEXT: lh a5, 12(a1) -; CHECK-RV32-NEXT: lh a6, 28(a1) -; CHECK-RV32-NEXT: lh a7, 24(a1) -; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a2, 20(a1) +; CHECK-RV32-NEXT: lh a3, 24(a1) +; CHECK-RV32-NEXT: lh a4, 28(a1) +; CHECK-RV32-NEXT: lh a5, 0(a1) +; CHECK-RV32-NEXT: lh a6, 4(a1) +; CHECK-RV32-NEXT: lh a7, 8(a1) +; CHECK-RV32-NEXT: lh t0, 12(a1) ; CHECK-RV32-NEXT: lh a1, 16(a1) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: sh a4, 14(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: sh a3, 12(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: sh a2, 10(a0) ; CHECK-RV32-NEXT: ntl.pall ; CHECK-RV32-NEXT: sh a1, 8(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: sh t0, 6(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: sh a7, 4(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: sh a6, 2(a0) ; CHECK-RV32-NEXT: ntl.pall -; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: sh a5, 0(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v8i16: ; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a7, 40(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 56(a1) ; CHECK-RV64C-NEXT: lh a6, 0(a1) -; CHECK-RV64C-NEXT: lh a7, 8(a1) -; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh t0, 8(a1) +; CHECK-RV64C-NEXT: lh a2, 16(a1) ; CHECK-RV64C-NEXT: lh a5, 24(a1) -; CHECK-RV64C-NEXT: lh a2, 56(a1) -; CHECK-RV64C-NEXT: lh a3, 48(a1) -; CHECK-RV64C-NEXT: lh a4, 40(a1) ; CHECK-RV64C-NEXT: lh a1, 32(a1) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: sh a4, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.pall ; CHECK-RV64C-NEXT: sh a3, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: sh a7, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.pall ; CHECK-RV64C-NEXT: sh a1, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.pall ; CHECK-RV64C-NEXT: sh a5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: sh a2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.pall -; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: sh t0, 2(a0) ; CHECK-RV64C-NEXT: 
c.ntl.pall ; CHECK-RV64C-NEXT: sh a6, 0(a0) ; CHECK-RV64C-NEXT: ret ; ; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v8i16: ; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a7, 20(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 28(a1) ; CHECK-RV32C-NEXT: lh a6, 0(a1) -; CHECK-RV32C-NEXT: lh a7, 4(a1) -; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh t0, 4(a1) +; CHECK-RV32C-NEXT: lh a2, 8(a1) ; CHECK-RV32C-NEXT: lh a5, 12(a1) -; CHECK-RV32C-NEXT: lh a2, 28(a1) -; CHECK-RV32C-NEXT: lh a3, 24(a1) -; CHECK-RV32C-NEXT: lh a4, 20(a1) ; CHECK-RV32C-NEXT: lh a1, 16(a1) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: sh a4, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.pall ; CHECK-RV32C-NEXT: sh a3, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: sh a7, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.pall ; CHECK-RV32C-NEXT: sh a1, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.pall ; CHECK-RV32C-NEXT: sh a5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: sh a2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.pall -; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: sh t0, 2(a0) ; CHECK-RV32C-NEXT: c.ntl.pall ; CHECK-RV32C-NEXT: sh a6, 0(a0) ; CHECK-RV32C-NEXT: ret @@ -5149,54 +5149,54 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64-NEXT: .cfi_offset s0, -8 ; CHECK-RV64-NEXT: .cfi_offset s1, -16 -; CHECK-RV64-NEXT: lbu a2, 0(a1) -; CHECK-RV64-NEXT: lbu a3, 8(a1) -; CHECK-RV64-NEXT: lbu a4, 16(a1) -; CHECK-RV64-NEXT: lbu a5, 24(a1) -; CHECK-RV64-NEXT: lbu a6, 32(a1) -; CHECK-RV64-NEXT: lbu a7, 40(a1) -; CHECK-RV64-NEXT: lbu t0, 48(a1) -; CHECK-RV64-NEXT: lbu t1, 56(a1) -; CHECK-RV64-NEXT: lbu t2, 64(a1) -; CHECK-RV64-NEXT: lbu t3, 72(a1) -; CHECK-RV64-NEXT: lbu t4, 80(a1) -; CHECK-RV64-NEXT: lbu t5, 88(a1) -; CHECK-RV64-NEXT: lbu t6, 120(a1) -; CHECK-RV64-NEXT: lbu s0, 112(a1) -; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a2, 104(a1) +; CHECK-RV64-NEXT: lbu a3, 112(a1) +; CHECK-RV64-NEXT: lbu a4, 120(a1) +; CHECK-RV64-NEXT: lbu a5, 0(a1) +; CHECK-RV64-NEXT: lbu a6, 8(a1) +; CHECK-RV64-NEXT: lbu a7, 16(a1) +; CHECK-RV64-NEXT: lbu t0, 24(a1) +; CHECK-RV64-NEXT: lbu t1, 32(a1) +; CHECK-RV64-NEXT: lbu t2, 40(a1) +; CHECK-RV64-NEXT: lbu t3, 48(a1) +; CHECK-RV64-NEXT: lbu t4, 56(a1) +; CHECK-RV64-NEXT: lbu t5, 64(a1) +; CHECK-RV64-NEXT: lbu t6, 72(a1) +; CHECK-RV64-NEXT: lbu s0, 80(a1) +; CHECK-RV64-NEXT: lbu s1, 88(a1) ; CHECK-RV64-NEXT: lbu a1, 96(a1) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: sb a4, 15(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: sb a3, 14(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: sb a2, 13(a0) ; CHECK-RV64-NEXT: ntl.s1 ; CHECK-RV64-NEXT: sb a1, 12(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: sb s1, 11(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: sb s0, 10(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: sb t6, 9(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: sb t5, 8(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: sb t4, 7(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: sb t3, 6(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb a7, 5(a0) +; 
CHECK-RV64-NEXT: sb t2, 5(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: sb t1, 4(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: sb t0, 3(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: sb a7, 2(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: sb a6, 1(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: sb a5, 0(a0) ; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -5210,54 +5210,54 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: .cfi_offset s0, -4 ; CHECK-RV32-NEXT: .cfi_offset s1, -8 -; CHECK-RV32-NEXT: lbu a2, 0(a1) -; CHECK-RV32-NEXT: lbu a3, 4(a1) -; CHECK-RV32-NEXT: lbu a4, 8(a1) -; CHECK-RV32-NEXT: lbu a5, 12(a1) -; CHECK-RV32-NEXT: lbu a6, 16(a1) -; CHECK-RV32-NEXT: lbu a7, 20(a1) -; CHECK-RV32-NEXT: lbu t0, 24(a1) -; CHECK-RV32-NEXT: lbu t1, 28(a1) -; CHECK-RV32-NEXT: lbu t2, 32(a1) -; CHECK-RV32-NEXT: lbu t3, 36(a1) -; CHECK-RV32-NEXT: lbu t4, 40(a1) -; CHECK-RV32-NEXT: lbu t5, 44(a1) -; CHECK-RV32-NEXT: lbu t6, 60(a1) -; CHECK-RV32-NEXT: lbu s0, 56(a1) -; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a2, 52(a1) +; CHECK-RV32-NEXT: lbu a3, 56(a1) +; CHECK-RV32-NEXT: lbu a4, 60(a1) +; CHECK-RV32-NEXT: lbu a5, 0(a1) +; CHECK-RV32-NEXT: lbu a6, 4(a1) +; CHECK-RV32-NEXT: lbu a7, 8(a1) +; CHECK-RV32-NEXT: lbu t0, 12(a1) +; CHECK-RV32-NEXT: lbu t1, 16(a1) +; CHECK-RV32-NEXT: lbu t2, 20(a1) +; CHECK-RV32-NEXT: lbu t3, 24(a1) +; CHECK-RV32-NEXT: lbu t4, 28(a1) +; CHECK-RV32-NEXT: lbu t5, 32(a1) +; CHECK-RV32-NEXT: lbu t6, 36(a1) +; CHECK-RV32-NEXT: lbu s0, 40(a1) +; CHECK-RV32-NEXT: lbu s1, 44(a1) ; CHECK-RV32-NEXT: lbu a1, 48(a1) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: sb a4, 15(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: sb a3, 14(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: sb a2, 13(a0) ; CHECK-RV32-NEXT: ntl.s1 ; CHECK-RV32-NEXT: sb a1, 12(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: sb s1, 11(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: sb s0, 10(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: sb t6, 9(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: sb t5, 8(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: sb t4, 7(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: sb t3, 6(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: sb t2, 5(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: sb t1, 4(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: sb t0, 3(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: sb a7, 2(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: sb a6, 1(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: sb a5, 0(a0) ; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -5271,44 +5271,44 @@ 
define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64C-NEXT: .cfi_offset s0, -8 ; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu t3, 104(a1) +; CHECK-RV64C-NEXT: lbu t6, 112(a1) +; CHECK-RV64C-NEXT: lbu a4, 120(a1) ; CHECK-RV64C-NEXT: lbu a6, 0(a1) ; CHECK-RV64C-NEXT: lbu a7, 8(a1) ; CHECK-RV64C-NEXT: lbu t0, 16(a1) ; CHECK-RV64C-NEXT: lbu t1, 24(a1) ; CHECK-RV64C-NEXT: lbu t2, 32(a1) -; CHECK-RV64C-NEXT: lbu t3, 40(a1) -; CHECK-RV64C-NEXT: lbu t4, 48(a1) -; CHECK-RV64C-NEXT: lbu t5, 56(a1) -; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu t4, 40(a1) +; CHECK-RV64C-NEXT: lbu t5, 48(a1) +; CHECK-RV64C-NEXT: lbu a5, 56(a1) +; CHECK-RV64C-NEXT: lbu a2, 64(a1) ; CHECK-RV64C-NEXT: lbu a3, 72(a1) -; CHECK-RV64C-NEXT: lbu a4, 80(a1) -; CHECK-RV64C-NEXT: lbu a5, 88(a1) -; CHECK-RV64C-NEXT: lbu a2, 120(a1) -; CHECK-RV64C-NEXT: lbu s0, 112(a1) -; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu s0, 80(a1) +; CHECK-RV64C-NEXT: lbu s1, 88(a1) ; CHECK-RV64C-NEXT: lbu a1, 96(a1) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: sb a4, 15(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: sb t6, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: sb t3, 13(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 ; CHECK-RV64C-NEXT: sb a1, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: sb s1, 11(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: sb s0, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 ; CHECK-RV64C-NEXT: sb a3, 9(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: sb a2, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: sb a5, 7(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: sb t5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: sb t4, 5(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 ; CHECK-RV64C-NEXT: sb t2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 @@ -5332,44 +5332,44 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32C-NEXT: .cfi_offset s0, -4 ; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu t3, 52(a1) +; CHECK-RV32C-NEXT: lbu t6, 56(a1) +; CHECK-RV32C-NEXT: lbu a4, 60(a1) ; CHECK-RV32C-NEXT: lbu a6, 0(a1) ; CHECK-RV32C-NEXT: lbu a7, 4(a1) ; CHECK-RV32C-NEXT: lbu t0, 8(a1) ; CHECK-RV32C-NEXT: lbu t1, 12(a1) ; CHECK-RV32C-NEXT: lbu t2, 16(a1) -; CHECK-RV32C-NEXT: lbu t3, 20(a1) -; CHECK-RV32C-NEXT: lbu t4, 24(a1) -; CHECK-RV32C-NEXT: lbu t5, 28(a1) -; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu t4, 20(a1) +; CHECK-RV32C-NEXT: lbu t5, 24(a1) +; CHECK-RV32C-NEXT: lbu a5, 28(a1) +; CHECK-RV32C-NEXT: lbu a2, 32(a1) ; CHECK-RV32C-NEXT: lbu a3, 36(a1) -; CHECK-RV32C-NEXT: lbu a4, 40(a1) -; CHECK-RV32C-NEXT: lbu a5, 44(a1) -; CHECK-RV32C-NEXT: lbu a2, 60(a1) -; CHECK-RV32C-NEXT: lbu s0, 56(a1) -; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu s0, 40(a1) +; CHECK-RV32C-NEXT: lbu s1, 44(a1) ; CHECK-RV32C-NEXT: lbu a1, 48(a1) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: sb a4, 15(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: sb t6, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; 
CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: sb t3, 13(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 ; CHECK-RV32C-NEXT: sb a1, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: sb s1, 11(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: sb s0, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 ; CHECK-RV32C-NEXT: sb a3, 9(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: sb a2, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: sb a5, 7(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: sb t5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: sb t4, 5(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 ; CHECK-RV32C-NEXT: sb t2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 @@ -5405,112 +5405,112 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) { define void @test_nontemporal_S1_store_v8i16(ptr %p, <8 x i16> %v) { ; CHECK-RV64-LABEL: test_nontemporal_S1_store_v8i16: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: lh a2, 0(a1) -; CHECK-RV64-NEXT: lh a3, 8(a1) -; CHECK-RV64-NEXT: lh a4, 16(a1) -; CHECK-RV64-NEXT: lh a5, 24(a1) -; CHECK-RV64-NEXT: lh a6, 56(a1) -; CHECK-RV64-NEXT: lh a7, 48(a1) -; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a2, 40(a1) +; CHECK-RV64-NEXT: lh a3, 48(a1) +; CHECK-RV64-NEXT: lh a4, 56(a1) +; CHECK-RV64-NEXT: lh a5, 0(a1) +; CHECK-RV64-NEXT: lh a6, 8(a1) +; CHECK-RV64-NEXT: lh a7, 16(a1) +; CHECK-RV64-NEXT: lh t0, 24(a1) ; CHECK-RV64-NEXT: lh a1, 32(a1) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: sh a4, 14(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: sh a3, 12(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: sh a2, 10(a0) ; CHECK-RV64-NEXT: ntl.s1 ; CHECK-RV64-NEXT: sh a1, 8(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: sh t0, 6(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: sh a7, 4(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: sh a6, 2(a0) ; CHECK-RV64-NEXT: ntl.s1 -; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: sh a5, 0(a0) ; CHECK-RV64-NEXT: ret ; ; CHECK-RV32-LABEL: test_nontemporal_S1_store_v8i16: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lh a2, 0(a1) -; CHECK-RV32-NEXT: lh a3, 4(a1) -; CHECK-RV32-NEXT: lh a4, 8(a1) -; CHECK-RV32-NEXT: lh a5, 12(a1) -; CHECK-RV32-NEXT: lh a6, 28(a1) -; CHECK-RV32-NEXT: lh a7, 24(a1) -; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a2, 20(a1) +; CHECK-RV32-NEXT: lh a3, 24(a1) +; CHECK-RV32-NEXT: lh a4, 28(a1) +; CHECK-RV32-NEXT: lh a5, 0(a1) +; CHECK-RV32-NEXT: lh a6, 4(a1) +; CHECK-RV32-NEXT: lh a7, 8(a1) +; CHECK-RV32-NEXT: lh t0, 12(a1) ; CHECK-RV32-NEXT: lh a1, 16(a1) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: sh a4, 14(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: sh a3, 12(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: sh a2, 10(a0) ; CHECK-RV32-NEXT: ntl.s1 ; CHECK-RV32-NEXT: sh a1, 8(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: sh t0, 6(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: sh a7, 4(a0) ; CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: sh a6, 2(a0) ; 
CHECK-RV32-NEXT: ntl.s1 -; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: sh a5, 0(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v8i16: ; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a7, 40(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 56(a1) ; CHECK-RV64C-NEXT: lh a6, 0(a1) -; CHECK-RV64C-NEXT: lh a7, 8(a1) -; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh t0, 8(a1) +; CHECK-RV64C-NEXT: lh a2, 16(a1) ; CHECK-RV64C-NEXT: lh a5, 24(a1) -; CHECK-RV64C-NEXT: lh a2, 56(a1) -; CHECK-RV64C-NEXT: lh a3, 48(a1) -; CHECK-RV64C-NEXT: lh a4, 40(a1) ; CHECK-RV64C-NEXT: lh a1, 32(a1) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: sh a4, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 ; CHECK-RV64C-NEXT: sh a3, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: sh a7, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 ; CHECK-RV64C-NEXT: sh a1, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 ; CHECK-RV64C-NEXT: sh a5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: sh a2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 -; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: sh t0, 2(a0) ; CHECK-RV64C-NEXT: c.ntl.s1 ; CHECK-RV64C-NEXT: sh a6, 0(a0) ; CHECK-RV64C-NEXT: ret ; ; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v8i16: ; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a7, 20(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 28(a1) ; CHECK-RV32C-NEXT: lh a6, 0(a1) -; CHECK-RV32C-NEXT: lh a7, 4(a1) -; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh t0, 4(a1) +; CHECK-RV32C-NEXT: lh a2, 8(a1) ; CHECK-RV32C-NEXT: lh a5, 12(a1) -; CHECK-RV32C-NEXT: lh a2, 28(a1) -; CHECK-RV32C-NEXT: lh a3, 24(a1) -; CHECK-RV32C-NEXT: lh a4, 20(a1) ; CHECK-RV32C-NEXT: lh a1, 16(a1) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: sh a4, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 ; CHECK-RV32C-NEXT: sh a3, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: sh a7, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 ; CHECK-RV32C-NEXT: sh a1, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 ; CHECK-RV32C-NEXT: sh a5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: sh a2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 -; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: sh t0, 2(a0) ; CHECK-RV32C-NEXT: c.ntl.s1 ; CHECK-RV32C-NEXT: sh a6, 0(a0) ; CHECK-RV32C-NEXT: ret @@ -6563,54 +6563,54 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64-NEXT: .cfi_offset s0, -8 ; CHECK-RV64-NEXT: .cfi_offset s1, -16 -; CHECK-RV64-NEXT: lbu a2, 0(a1) -; CHECK-RV64-NEXT: lbu a3, 8(a1) -; CHECK-RV64-NEXT: lbu a4, 16(a1) -; CHECK-RV64-NEXT: lbu a5, 24(a1) -; CHECK-RV64-NEXT: lbu a6, 32(a1) -; CHECK-RV64-NEXT: lbu a7, 40(a1) -; CHECK-RV64-NEXT: lbu t0, 48(a1) -; CHECK-RV64-NEXT: lbu t1, 56(a1) -; CHECK-RV64-NEXT: lbu t2, 64(a1) -; CHECK-RV64-NEXT: lbu t3, 72(a1) -; CHECK-RV64-NEXT: lbu t4, 80(a1) -; CHECK-RV64-NEXT: lbu t5, 88(a1) -; CHECK-RV64-NEXT: lbu t6, 120(a1) -; CHECK-RV64-NEXT: lbu s0, 112(a1) -; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a2, 104(a1) +; CHECK-RV64-NEXT: lbu a3, 112(a1) +; CHECK-RV64-NEXT: lbu a4, 120(a1) +; CHECK-RV64-NEXT: lbu a5, 0(a1) +; CHECK-RV64-NEXT: lbu a6, 8(a1) +; CHECK-RV64-NEXT: lbu a7, 16(a1) +; CHECK-RV64-NEXT: lbu t0, 24(a1) +; CHECK-RV64-NEXT: lbu t1, 32(a1) +; CHECK-RV64-NEXT: lbu 
t2, 40(a1) +; CHECK-RV64-NEXT: lbu t3, 48(a1) +; CHECK-RV64-NEXT: lbu t4, 56(a1) +; CHECK-RV64-NEXT: lbu t5, 64(a1) +; CHECK-RV64-NEXT: lbu t6, 72(a1) +; CHECK-RV64-NEXT: lbu s0, 80(a1) +; CHECK-RV64-NEXT: lbu s1, 88(a1) ; CHECK-RV64-NEXT: lbu a1, 96(a1) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: sb a4, 15(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: sb a3, 14(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: sb a2, 13(a0) ; CHECK-RV64-NEXT: ntl.all ; CHECK-RV64-NEXT: sb a1, 12(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: sb s1, 11(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: sb s0, 10(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: sb t6, 9(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: sb t5, 8(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: sb t4, 7(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: sb t3, 6(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: sb t2, 5(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: sb t1, 4(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: sb t0, 3(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: sb a7, 2(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: sb a6, 1(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: sb a5, 0(a0) ; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -6624,54 +6624,54 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: .cfi_offset s0, -4 ; CHECK-RV32-NEXT: .cfi_offset s1, -8 -; CHECK-RV32-NEXT: lbu a2, 0(a1) -; CHECK-RV32-NEXT: lbu a3, 4(a1) -; CHECK-RV32-NEXT: lbu a4, 8(a1) -; CHECK-RV32-NEXT: lbu a5, 12(a1) -; CHECK-RV32-NEXT: lbu a6, 16(a1) -; CHECK-RV32-NEXT: lbu a7, 20(a1) -; CHECK-RV32-NEXT: lbu t0, 24(a1) -; CHECK-RV32-NEXT: lbu t1, 28(a1) -; CHECK-RV32-NEXT: lbu t2, 32(a1) -; CHECK-RV32-NEXT: lbu t3, 36(a1) -; CHECK-RV32-NEXT: lbu t4, 40(a1) -; CHECK-RV32-NEXT: lbu t5, 44(a1) -; CHECK-RV32-NEXT: lbu t6, 60(a1) -; CHECK-RV32-NEXT: lbu s0, 56(a1) -; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a2, 52(a1) +; CHECK-RV32-NEXT: lbu a3, 56(a1) +; CHECK-RV32-NEXT: lbu a4, 60(a1) +; CHECK-RV32-NEXT: lbu a5, 0(a1) +; CHECK-RV32-NEXT: lbu a6, 4(a1) +; CHECK-RV32-NEXT: lbu a7, 8(a1) +; CHECK-RV32-NEXT: lbu t0, 12(a1) +; CHECK-RV32-NEXT: lbu t1, 16(a1) +; CHECK-RV32-NEXT: lbu t2, 20(a1) +; CHECK-RV32-NEXT: lbu t3, 24(a1) +; CHECK-RV32-NEXT: lbu t4, 28(a1) +; CHECK-RV32-NEXT: lbu t5, 32(a1) +; CHECK-RV32-NEXT: lbu t6, 36(a1) +; CHECK-RV32-NEXT: lbu s0, 40(a1) +; CHECK-RV32-NEXT: lbu s1, 44(a1) ; CHECK-RV32-NEXT: lbu a1, 48(a1) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: sb a4, 15(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: sb a3, 14(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: sb a2, 13(a0) ; CHECK-RV32-NEXT: ntl.all ; CHECK-RV32-NEXT: sb a1, 12(a0) ; CHECK-RV32-NEXT: ntl.all -; 
CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: sb s1, 11(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: sb s0, 10(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: sb t6, 9(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: sb t5, 8(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: sb t4, 7(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: sb t3, 6(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: sb t2, 5(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: sb t1, 4(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: sb t0, 3(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: sb a7, 2(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: sb a6, 1(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: sb a5, 0(a0) ; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -6685,44 +6685,44 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) { ; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill ; CHECK-RV64C-NEXT: .cfi_offset s0, -8 ; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu t3, 104(a1) +; CHECK-RV64C-NEXT: lbu t6, 112(a1) +; CHECK-RV64C-NEXT: lbu a4, 120(a1) ; CHECK-RV64C-NEXT: lbu a6, 0(a1) ; CHECK-RV64C-NEXT: lbu a7, 8(a1) ; CHECK-RV64C-NEXT: lbu t0, 16(a1) ; CHECK-RV64C-NEXT: lbu t1, 24(a1) ; CHECK-RV64C-NEXT: lbu t2, 32(a1) -; CHECK-RV64C-NEXT: lbu t3, 40(a1) -; CHECK-RV64C-NEXT: lbu t4, 48(a1) -; CHECK-RV64C-NEXT: lbu t5, 56(a1) -; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu t4, 40(a1) +; CHECK-RV64C-NEXT: lbu t5, 48(a1) +; CHECK-RV64C-NEXT: lbu a5, 56(a1) +; CHECK-RV64C-NEXT: lbu a2, 64(a1) ; CHECK-RV64C-NEXT: lbu a3, 72(a1) -; CHECK-RV64C-NEXT: lbu a4, 80(a1) -; CHECK-RV64C-NEXT: lbu a5, 88(a1) -; CHECK-RV64C-NEXT: lbu a2, 120(a1) -; CHECK-RV64C-NEXT: lbu s0, 112(a1) -; CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu s0, 80(a1) +; CHECK-RV64C-NEXT: lbu s1, 88(a1) ; CHECK-RV64C-NEXT: lbu a1, 96(a1) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: sb a4, 15(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: sb t6, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: sb t3, 13(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sb a1, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: sb s1, 11(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: sb s0, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sb a3, 9(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: sb a2, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: sb a5, 7(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: sb t5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: sb t4, 5(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sb t2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.all @@ -6746,44 +6746,44 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 
x i8> %v) { ; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-RV32C-NEXT: .cfi_offset s0, -4 ; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu t3, 52(a1) +; CHECK-RV32C-NEXT: lbu t6, 56(a1) +; CHECK-RV32C-NEXT: lbu a4, 60(a1) ; CHECK-RV32C-NEXT: lbu a6, 0(a1) ; CHECK-RV32C-NEXT: lbu a7, 4(a1) ; CHECK-RV32C-NEXT: lbu t0, 8(a1) ; CHECK-RV32C-NEXT: lbu t1, 12(a1) ; CHECK-RV32C-NEXT: lbu t2, 16(a1) -; CHECK-RV32C-NEXT: lbu t3, 20(a1) -; CHECK-RV32C-NEXT: lbu t4, 24(a1) -; CHECK-RV32C-NEXT: lbu t5, 28(a1) -; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu t4, 20(a1) +; CHECK-RV32C-NEXT: lbu t5, 24(a1) +; CHECK-RV32C-NEXT: lbu a5, 28(a1) +; CHECK-RV32C-NEXT: lbu a2, 32(a1) ; CHECK-RV32C-NEXT: lbu a3, 36(a1) -; CHECK-RV32C-NEXT: lbu a4, 40(a1) -; CHECK-RV32C-NEXT: lbu a5, 44(a1) -; CHECK-RV32C-NEXT: lbu a2, 60(a1) -; CHECK-RV32C-NEXT: lbu s0, 56(a1) -; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu s0, 40(a1) +; CHECK-RV32C-NEXT: lbu s1, 44(a1) ; CHECK-RV32C-NEXT: lbu a1, 48(a1) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: sb a4, 15(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: sb t6, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: sb t3, 13(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sb a1, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: sb s1, 11(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: sb s0, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sb a3, 9(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: sb a2, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: sb a5, 7(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: sb t5, 6(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: sb t4, 5(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sb t2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.all @@ -6819,112 +6819,112 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) { define void @test_nontemporal_ALL_store_v8i16(ptr %p, <8 x i16> %v) { ; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v8i16: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: lh a2, 0(a1) -; CHECK-RV64-NEXT: lh a3, 8(a1) -; CHECK-RV64-NEXT: lh a4, 16(a1) -; CHECK-RV64-NEXT: lh a5, 24(a1) -; CHECK-RV64-NEXT: lh a6, 56(a1) -; CHECK-RV64-NEXT: lh a7, 48(a1) -; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a2, 40(a1) +; CHECK-RV64-NEXT: lh a3, 48(a1) +; CHECK-RV64-NEXT: lh a4, 56(a1) +; CHECK-RV64-NEXT: lh a5, 0(a1) +; CHECK-RV64-NEXT: lh a6, 8(a1) +; CHECK-RV64-NEXT: lh a7, 16(a1) +; CHECK-RV64-NEXT: lh t0, 24(a1) ; CHECK-RV64-NEXT: lh a1, 32(a1) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: sh a4, 14(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: sh a3, 12(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: sh a2, 10(a0) ; CHECK-RV64-NEXT: ntl.all ; CHECK-RV64-NEXT: sh a1, 8(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: sh t0, 6(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: sh a7, 4(a0) ; CHECK-RV64-NEXT: ntl.all -; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: sh a6, 2(a0) ; CHECK-RV64-NEXT: ntl.all -; 
CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: sh a5, 0(a0) ; CHECK-RV64-NEXT: ret ; ; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v8i16: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: lh a2, 0(a1) -; CHECK-RV32-NEXT: lh a3, 4(a1) -; CHECK-RV32-NEXT: lh a4, 8(a1) -; CHECK-RV32-NEXT: lh a5, 12(a1) -; CHECK-RV32-NEXT: lh a6, 28(a1) -; CHECK-RV32-NEXT: lh a7, 24(a1) -; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a2, 20(a1) +; CHECK-RV32-NEXT: lh a3, 24(a1) +; CHECK-RV32-NEXT: lh a4, 28(a1) +; CHECK-RV32-NEXT: lh a5, 0(a1) +; CHECK-RV32-NEXT: lh a6, 4(a1) +; CHECK-RV32-NEXT: lh a7, 8(a1) +; CHECK-RV32-NEXT: lh t0, 12(a1) ; CHECK-RV32-NEXT: lh a1, 16(a1) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: sh a4, 14(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: sh a3, 12(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: sh a2, 10(a0) ; CHECK-RV32-NEXT: ntl.all ; CHECK-RV32-NEXT: sh a1, 8(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: sh t0, 6(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: sh a7, 4(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: sh a6, 2(a0) ; CHECK-RV32-NEXT: ntl.all -; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: sh a5, 0(a0) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v8i16: ; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a7, 40(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 56(a1) ; CHECK-RV64C-NEXT: lh a6, 0(a1) -; CHECK-RV64C-NEXT: lh a7, 8(a1) -; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh t0, 8(a1) +; CHECK-RV64C-NEXT: lh a2, 16(a1) ; CHECK-RV64C-NEXT: lh a5, 24(a1) -; CHECK-RV64C-NEXT: lh a2, 56(a1) -; CHECK-RV64C-NEXT: lh a3, 48(a1) -; CHECK-RV64C-NEXT: lh a4, 40(a1) ; CHECK-RV64C-NEXT: lh a1, 32(a1) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: sh a4, 14(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sh a3, 12(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: sh a7, 10(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sh a1, 8(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sh a5, 6(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: sh a2, 4(a0) ; CHECK-RV64C-NEXT: c.ntl.all -; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: sh t0, 2(a0) ; CHECK-RV64C-NEXT: c.ntl.all ; CHECK-RV64C-NEXT: sh a6, 0(a0) ; CHECK-RV64C-NEXT: ret ; ; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v8i16: ; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a7, 20(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 28(a1) ; CHECK-RV32C-NEXT: lh a6, 0(a1) -; CHECK-RV32C-NEXT: lh a7, 4(a1) -; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh t0, 4(a1) +; CHECK-RV32C-NEXT: lh a2, 8(a1) ; CHECK-RV32C-NEXT: lh a5, 12(a1) -; CHECK-RV32C-NEXT: lh a2, 28(a1) -; CHECK-RV32C-NEXT: lh a3, 24(a1) -; CHECK-RV32C-NEXT: lh a4, 20(a1) ; CHECK-RV32C-NEXT: lh a1, 16(a1) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: sh a4, 14(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sh a3, 12(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: sh a7, 10(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sh a1, 8(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sh a5, 6(a0) ; CHECK-RV32C-NEXT: 
c.ntl.all -; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: sh a2, 4(a0) ; CHECK-RV32C-NEXT: c.ntl.all -; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: sh t0, 2(a0) ; CHECK-RV32C-NEXT: c.ntl.all ; CHECK-RV32C-NEXT: sh a6, 0(a0) ; CHECK-RV32C-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index 7c3294fa81dcf..71b9d9b8c5dca 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -1241,8 +1241,8 @@ define i64 @foo2(ptr %p) { define void @PR41129(ptr %p64) { ; RV32-LABEL: PR41129: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lw a2, 4(a0) ; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a2, 4(a0) ; RV32-NEXT: or a3, a1, a2 ; RV32-NEXT: beqz a3, .LBB37_2 ; RV32-NEXT: # %bb.1: # %false diff --git a/llvm/test/CodeGen/RISCV/pr63816.ll b/llvm/test/CodeGen/RISCV/pr63816.ll index 21730dfcf13bc..45e5336d9c58c 100644 --- a/llvm/test/CodeGen/RISCV/pr63816.ll +++ b/llvm/test/CodeGen/RISCV/pr63816.ll @@ -4,17 +4,20 @@ define void @test(ptr %0, ptr %1) nounwind { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -80 -; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs4, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs5, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs6, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -112 +; CHECK-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 96(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs0, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs1, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs2, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs3, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs4, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs5, 16(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs6, 8(sp) # 8-byte Folded Spill ; CHECK-NEXT: mv s0, a1 ; CHECK-NEXT: mv s1, a0 ; CHECK-NEXT: lhu a0, 12(a0) @@ -30,19 +33,19 @@ define void @test(ptr %0, ptr %1) nounwind { ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fmv.s fs2, fa0 ; CHECK-NEXT: lhu a0, 6(s1) +; CHECK-NEXT: lhu s2, 0(s1) +; CHECK-NEXT: lhu s3, 2(s1) +; CHECK-NEXT: lhu s4, 4(s1) ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fmv.s fs3, fa0 -; CHECK-NEXT: lhu a0, 4(s1) -; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: fmv.w.x fa0, s4 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fmv.s fs4, fa0 -; CHECK-NEXT: lhu a0, 2(s1) -; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: fmv.w.x fa0, s3 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fmv.s fs5, fa0 -; CHECK-NEXT: lhu a0, 0(s1) -; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: fmv.w.x fa0, s2 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.d.s fs6, fa0 ; CHECK-NEXT: fcvt.d.s fs5, fs5 @@ -63,17 +66,20 @@ define void @test(ptr %0, ptr %1) nounwind { ; CHECK-NEXT: fsd fs4, 16(s0) ; CHECK-NEXT: fsd fs5, 8(s0) ; CHECK-NEXT: fsd fs6, 0(s0) -; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 64(sp) 
# 8-byte Folded Reload -; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs4, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs5, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs6, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 80 +; CHECK-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs0, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs1, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs2, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs3, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs4, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs5, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs6, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 112 ; CHECK-NEXT: ret %V1 = load <8 x half>, ptr %0 %V2 = fpext <8 x half> %V1 to <8 x double> diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll index 9ff4235746caf..9bc7bb6170a0f 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll @@ -1109,41 +1109,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind { ; RV32IZCMP-NEXT: cm.push {ra, s0-s4}, -32 ; RV32IZCMP-NEXT: lui a0, %hi(var0) ; RV32IZCMP-NEXT: lw a6, %lo(var0)(a0) -; RV32IZCMP-NEXT: lw a7, %lo(var0+4)(a0) -; RV32IZCMP-NEXT: lw t0, %lo(var0+8)(a0) -; RV32IZCMP-NEXT: lw t1, %lo(var0+12)(a0) -; RV32IZCMP-NEXT: addi a5, a0, %lo(var0) -; RV32IZCMP-NEXT: lw t2, 16(a5) -; RV32IZCMP-NEXT: lw t3, 20(a5) -; RV32IZCMP-NEXT: lw t4, 24(a5) -; RV32IZCMP-NEXT: lw t5, 28(a5) -; RV32IZCMP-NEXT: lw t6, 32(a5) -; RV32IZCMP-NEXT: lw s2, 36(a5) -; RV32IZCMP-NEXT: lw s3, 40(a5) -; RV32IZCMP-NEXT: lw s4, 44(a5) -; RV32IZCMP-NEXT: lw a1, 48(a5) -; RV32IZCMP-NEXT: lw s0, 52(a5) -; RV32IZCMP-NEXT: lw s1, 68(a5) -; RV32IZCMP-NEXT: lw a2, 64(a5) -; RV32IZCMP-NEXT: lw a3, 60(a5) -; RV32IZCMP-NEXT: lw a4, 56(a5) -; RV32IZCMP-NEXT: sw s1, 68(a5) -; RV32IZCMP-NEXT: sw a2, 64(a5) -; RV32IZCMP-NEXT: sw a3, 60(a5) -; RV32IZCMP-NEXT: sw a4, 56(a5) -; RV32IZCMP-NEXT: sw s0, 52(a5) -; RV32IZCMP-NEXT: sw a1, 48(a5) -; RV32IZCMP-NEXT: sw s4, 44(a5) -; RV32IZCMP-NEXT: sw s3, 40(a5) -; RV32IZCMP-NEXT: sw s2, 36(a5) -; RV32IZCMP-NEXT: sw t6, 32(a5) -; RV32IZCMP-NEXT: sw t5, 28(a5) -; RV32IZCMP-NEXT: sw t4, 24(a5) -; RV32IZCMP-NEXT: sw t3, 20(a5) -; RV32IZCMP-NEXT: sw t2, 16(a5) -; RV32IZCMP-NEXT: sw t1, %lo(var0+12)(a0) -; RV32IZCMP-NEXT: sw t0, %lo(var0+8)(a0) -; RV32IZCMP-NEXT: sw a7, %lo(var0+4)(a0) +; RV32IZCMP-NEXT: addi a2, a0, %lo(var0) +; RV32IZCMP-NEXT: lw a7, 16(a2) +; RV32IZCMP-NEXT: lw t0, 20(a2) +; RV32IZCMP-NEXT: lw t1, 24(a2) +; RV32IZCMP-NEXT: lw t2, 28(a2) +; RV32IZCMP-NEXT: lw t3, 32(a2) +; RV32IZCMP-NEXT: lw t4, 36(a2) +; RV32IZCMP-NEXT: lw t5, 40(a2) +; RV32IZCMP-NEXT: lw t6, 44(a2) +; RV32IZCMP-NEXT: lw a3, 48(a2) +; RV32IZCMP-NEXT: lw a4, 52(a2) +; RV32IZCMP-NEXT: lw a5, 56(a2) +; RV32IZCMP-NEXT: lw a1, 60(a2) +; RV32IZCMP-NEXT: lw s0, 64(a2) +; RV32IZCMP-NEXT: lw s1, 68(a2) +; RV32IZCMP-NEXT: lw s2, %lo(var0+4)(a0) +; RV32IZCMP-NEXT: lw s3, 
%lo(var0+8)(a0) +; RV32IZCMP-NEXT: lw s4, %lo(var0+12)(a0) +; RV32IZCMP-NEXT: sw s1, 68(a2) +; RV32IZCMP-NEXT: sw s0, 64(a2) +; RV32IZCMP-NEXT: sw a1, 60(a2) +; RV32IZCMP-NEXT: sw a5, 56(a2) +; RV32IZCMP-NEXT: sw a4, 52(a2) +; RV32IZCMP-NEXT: sw a3, 48(a2) +; RV32IZCMP-NEXT: sw t6, 44(a2) +; RV32IZCMP-NEXT: sw t5, 40(a2) +; RV32IZCMP-NEXT: sw t4, 36(a2) +; RV32IZCMP-NEXT: sw t3, 32(a2) +; RV32IZCMP-NEXT: sw t2, 28(a2) +; RV32IZCMP-NEXT: sw t1, 24(a2) +; RV32IZCMP-NEXT: sw t0, 20(a2) +; RV32IZCMP-NEXT: sw a7, 16(a2) +; RV32IZCMP-NEXT: sw s4, %lo(var0+12)(a0) +; RV32IZCMP-NEXT: sw s3, %lo(var0+8)(a0) +; RV32IZCMP-NEXT: sw s2, %lo(var0+4)(a0) ; RV32IZCMP-NEXT: sw a6, %lo(var0)(a0) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s4}, 32 ; @@ -1152,41 +1152,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind { ; RV64IZCMP-NEXT: cm.push {ra, s0-s4}, -48 ; RV64IZCMP-NEXT: lui a0, %hi(var0) ; RV64IZCMP-NEXT: lw a6, %lo(var0)(a0) -; RV64IZCMP-NEXT: lw a7, %lo(var0+4)(a0) -; RV64IZCMP-NEXT: lw t0, %lo(var0+8)(a0) -; RV64IZCMP-NEXT: lw t1, %lo(var0+12)(a0) -; RV64IZCMP-NEXT: addi a5, a0, %lo(var0) -; RV64IZCMP-NEXT: lw t2, 16(a5) -; RV64IZCMP-NEXT: lw t3, 20(a5) -; RV64IZCMP-NEXT: lw t4, 24(a5) -; RV64IZCMP-NEXT: lw t5, 28(a5) -; RV64IZCMP-NEXT: lw t6, 32(a5) -; RV64IZCMP-NEXT: lw s2, 36(a5) -; RV64IZCMP-NEXT: lw s3, 40(a5) -; RV64IZCMP-NEXT: lw s4, 44(a5) -; RV64IZCMP-NEXT: lw a1, 48(a5) -; RV64IZCMP-NEXT: lw s0, 52(a5) -; RV64IZCMP-NEXT: lw s1, 68(a5) -; RV64IZCMP-NEXT: lw a2, 64(a5) -; RV64IZCMP-NEXT: lw a3, 60(a5) -; RV64IZCMP-NEXT: lw a4, 56(a5) -; RV64IZCMP-NEXT: sw s1, 68(a5) -; RV64IZCMP-NEXT: sw a2, 64(a5) -; RV64IZCMP-NEXT: sw a3, 60(a5) -; RV64IZCMP-NEXT: sw a4, 56(a5) -; RV64IZCMP-NEXT: sw s0, 52(a5) -; RV64IZCMP-NEXT: sw a1, 48(a5) -; RV64IZCMP-NEXT: sw s4, 44(a5) -; RV64IZCMP-NEXT: sw s3, 40(a5) -; RV64IZCMP-NEXT: sw s2, 36(a5) -; RV64IZCMP-NEXT: sw t6, 32(a5) -; RV64IZCMP-NEXT: sw t5, 28(a5) -; RV64IZCMP-NEXT: sw t4, 24(a5) -; RV64IZCMP-NEXT: sw t3, 20(a5) -; RV64IZCMP-NEXT: sw t2, 16(a5) -; RV64IZCMP-NEXT: sw t1, %lo(var0+12)(a0) -; RV64IZCMP-NEXT: sw t0, %lo(var0+8)(a0) -; RV64IZCMP-NEXT: sw a7, %lo(var0+4)(a0) +; RV64IZCMP-NEXT: addi a2, a0, %lo(var0) +; RV64IZCMP-NEXT: lw a7, 16(a2) +; RV64IZCMP-NEXT: lw t0, 20(a2) +; RV64IZCMP-NEXT: lw t1, 24(a2) +; RV64IZCMP-NEXT: lw t2, 28(a2) +; RV64IZCMP-NEXT: lw t3, 32(a2) +; RV64IZCMP-NEXT: lw t4, 36(a2) +; RV64IZCMP-NEXT: lw t5, 40(a2) +; RV64IZCMP-NEXT: lw t6, 44(a2) +; RV64IZCMP-NEXT: lw a3, 48(a2) +; RV64IZCMP-NEXT: lw a4, 52(a2) +; RV64IZCMP-NEXT: lw a5, 56(a2) +; RV64IZCMP-NEXT: lw a1, 60(a2) +; RV64IZCMP-NEXT: lw s0, 64(a2) +; RV64IZCMP-NEXT: lw s1, 68(a2) +; RV64IZCMP-NEXT: lw s2, %lo(var0+4)(a0) +; RV64IZCMP-NEXT: lw s3, %lo(var0+8)(a0) +; RV64IZCMP-NEXT: lw s4, %lo(var0+12)(a0) +; RV64IZCMP-NEXT: sw s1, 68(a2) +; RV64IZCMP-NEXT: sw s0, 64(a2) +; RV64IZCMP-NEXT: sw a1, 60(a2) +; RV64IZCMP-NEXT: sw a5, 56(a2) +; RV64IZCMP-NEXT: sw a4, 52(a2) +; RV64IZCMP-NEXT: sw a3, 48(a2) +; RV64IZCMP-NEXT: sw t6, 44(a2) +; RV64IZCMP-NEXT: sw t5, 40(a2) +; RV64IZCMP-NEXT: sw t4, 36(a2) +; RV64IZCMP-NEXT: sw t3, 32(a2) +; RV64IZCMP-NEXT: sw t2, 28(a2) +; RV64IZCMP-NEXT: sw t1, 24(a2) +; RV64IZCMP-NEXT: sw t0, 20(a2) +; RV64IZCMP-NEXT: sw a7, 16(a2) +; RV64IZCMP-NEXT: sw s4, %lo(var0+12)(a0) +; RV64IZCMP-NEXT: sw s3, %lo(var0+8)(a0) +; RV64IZCMP-NEXT: sw s2, %lo(var0+4)(a0) ; RV64IZCMP-NEXT: sw a6, %lo(var0)(a0) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s4}, 48 ; @@ -1195,41 +1195,41 @@ define void @many_args(i32, i32, i32, i32, i32, 
i32, i32, i32, i32) nounwind { ; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s4}, -32 ; RV32IZCMP-SR-NEXT: lui a0, %hi(var0) ; RV32IZCMP-SR-NEXT: lw a6, %lo(var0)(a0) -; RV32IZCMP-SR-NEXT: lw a7, %lo(var0+4)(a0) -; RV32IZCMP-SR-NEXT: lw t0, %lo(var0+8)(a0) -; RV32IZCMP-SR-NEXT: lw t1, %lo(var0+12)(a0) -; RV32IZCMP-SR-NEXT: addi a5, a0, %lo(var0) -; RV32IZCMP-SR-NEXT: lw t2, 16(a5) -; RV32IZCMP-SR-NEXT: lw t3, 20(a5) -; RV32IZCMP-SR-NEXT: lw t4, 24(a5) -; RV32IZCMP-SR-NEXT: lw t5, 28(a5) -; RV32IZCMP-SR-NEXT: lw t6, 32(a5) -; RV32IZCMP-SR-NEXT: lw s2, 36(a5) -; RV32IZCMP-SR-NEXT: lw s3, 40(a5) -; RV32IZCMP-SR-NEXT: lw s4, 44(a5) -; RV32IZCMP-SR-NEXT: lw a1, 48(a5) -; RV32IZCMP-SR-NEXT: lw s0, 52(a5) -; RV32IZCMP-SR-NEXT: lw s1, 68(a5) -; RV32IZCMP-SR-NEXT: lw a2, 64(a5) -; RV32IZCMP-SR-NEXT: lw a3, 60(a5) -; RV32IZCMP-SR-NEXT: lw a4, 56(a5) -; RV32IZCMP-SR-NEXT: sw s1, 68(a5) -; RV32IZCMP-SR-NEXT: sw a2, 64(a5) -; RV32IZCMP-SR-NEXT: sw a3, 60(a5) -; RV32IZCMP-SR-NEXT: sw a4, 56(a5) -; RV32IZCMP-SR-NEXT: sw s0, 52(a5) -; RV32IZCMP-SR-NEXT: sw a1, 48(a5) -; RV32IZCMP-SR-NEXT: sw s4, 44(a5) -; RV32IZCMP-SR-NEXT: sw s3, 40(a5) -; RV32IZCMP-SR-NEXT: sw s2, 36(a5) -; RV32IZCMP-SR-NEXT: sw t6, 32(a5) -; RV32IZCMP-SR-NEXT: sw t5, 28(a5) -; RV32IZCMP-SR-NEXT: sw t4, 24(a5) -; RV32IZCMP-SR-NEXT: sw t3, 20(a5) -; RV32IZCMP-SR-NEXT: sw t2, 16(a5) -; RV32IZCMP-SR-NEXT: sw t1, %lo(var0+12)(a0) -; RV32IZCMP-SR-NEXT: sw t0, %lo(var0+8)(a0) -; RV32IZCMP-SR-NEXT: sw a7, %lo(var0+4)(a0) +; RV32IZCMP-SR-NEXT: addi a2, a0, %lo(var0) +; RV32IZCMP-SR-NEXT: lw a7, 16(a2) +; RV32IZCMP-SR-NEXT: lw t0, 20(a2) +; RV32IZCMP-SR-NEXT: lw t1, 24(a2) +; RV32IZCMP-SR-NEXT: lw t2, 28(a2) +; RV32IZCMP-SR-NEXT: lw t3, 32(a2) +; RV32IZCMP-SR-NEXT: lw t4, 36(a2) +; RV32IZCMP-SR-NEXT: lw t5, 40(a2) +; RV32IZCMP-SR-NEXT: lw t6, 44(a2) +; RV32IZCMP-SR-NEXT: lw a3, 48(a2) +; RV32IZCMP-SR-NEXT: lw a4, 52(a2) +; RV32IZCMP-SR-NEXT: lw a5, 56(a2) +; RV32IZCMP-SR-NEXT: lw a1, 60(a2) +; RV32IZCMP-SR-NEXT: lw s0, 64(a2) +; RV32IZCMP-SR-NEXT: lw s1, 68(a2) +; RV32IZCMP-SR-NEXT: lw s2, %lo(var0+4)(a0) +; RV32IZCMP-SR-NEXT: lw s3, %lo(var0+8)(a0) +; RV32IZCMP-SR-NEXT: lw s4, %lo(var0+12)(a0) +; RV32IZCMP-SR-NEXT: sw s1, 68(a2) +; RV32IZCMP-SR-NEXT: sw s0, 64(a2) +; RV32IZCMP-SR-NEXT: sw a1, 60(a2) +; RV32IZCMP-SR-NEXT: sw a5, 56(a2) +; RV32IZCMP-SR-NEXT: sw a4, 52(a2) +; RV32IZCMP-SR-NEXT: sw a3, 48(a2) +; RV32IZCMP-SR-NEXT: sw t6, 44(a2) +; RV32IZCMP-SR-NEXT: sw t5, 40(a2) +; RV32IZCMP-SR-NEXT: sw t4, 36(a2) +; RV32IZCMP-SR-NEXT: sw t3, 32(a2) +; RV32IZCMP-SR-NEXT: sw t2, 28(a2) +; RV32IZCMP-SR-NEXT: sw t1, 24(a2) +; RV32IZCMP-SR-NEXT: sw t0, 20(a2) +; RV32IZCMP-SR-NEXT: sw a7, 16(a2) +; RV32IZCMP-SR-NEXT: sw s4, %lo(var0+12)(a0) +; RV32IZCMP-SR-NEXT: sw s3, %lo(var0+8)(a0) +; RV32IZCMP-SR-NEXT: sw s2, %lo(var0+4)(a0) ; RV32IZCMP-SR-NEXT: sw a6, %lo(var0)(a0) ; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s4}, 32 ; @@ -1238,41 +1238,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind { ; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s4}, -48 ; RV64IZCMP-SR-NEXT: lui a0, %hi(var0) ; RV64IZCMP-SR-NEXT: lw a6, %lo(var0)(a0) -; RV64IZCMP-SR-NEXT: lw a7, %lo(var0+4)(a0) -; RV64IZCMP-SR-NEXT: lw t0, %lo(var0+8)(a0) -; RV64IZCMP-SR-NEXT: lw t1, %lo(var0+12)(a0) -; RV64IZCMP-SR-NEXT: addi a5, a0, %lo(var0) -; RV64IZCMP-SR-NEXT: lw t2, 16(a5) -; RV64IZCMP-SR-NEXT: lw t3, 20(a5) -; RV64IZCMP-SR-NEXT: lw t4, 24(a5) -; RV64IZCMP-SR-NEXT: lw t5, 28(a5) -; RV64IZCMP-SR-NEXT: lw t6, 32(a5) -; RV64IZCMP-SR-NEXT: lw s2, 36(a5) -; RV64IZCMP-SR-NEXT: lw s3, 
40(a5) -; RV64IZCMP-SR-NEXT: lw s4, 44(a5) -; RV64IZCMP-SR-NEXT: lw a1, 48(a5) -; RV64IZCMP-SR-NEXT: lw s0, 52(a5) -; RV64IZCMP-SR-NEXT: lw s1, 68(a5) -; RV64IZCMP-SR-NEXT: lw a2, 64(a5) -; RV64IZCMP-SR-NEXT: lw a3, 60(a5) -; RV64IZCMP-SR-NEXT: lw a4, 56(a5) -; RV64IZCMP-SR-NEXT: sw s1, 68(a5) -; RV64IZCMP-SR-NEXT: sw a2, 64(a5) -; RV64IZCMP-SR-NEXT: sw a3, 60(a5) -; RV64IZCMP-SR-NEXT: sw a4, 56(a5) -; RV64IZCMP-SR-NEXT: sw s0, 52(a5) -; RV64IZCMP-SR-NEXT: sw a1, 48(a5) -; RV64IZCMP-SR-NEXT: sw s4, 44(a5) -; RV64IZCMP-SR-NEXT: sw s3, 40(a5) -; RV64IZCMP-SR-NEXT: sw s2, 36(a5) -; RV64IZCMP-SR-NEXT: sw t6, 32(a5) -; RV64IZCMP-SR-NEXT: sw t5, 28(a5) -; RV64IZCMP-SR-NEXT: sw t4, 24(a5) -; RV64IZCMP-SR-NEXT: sw t3, 20(a5) -; RV64IZCMP-SR-NEXT: sw t2, 16(a5) -; RV64IZCMP-SR-NEXT: sw t1, %lo(var0+12)(a0) -; RV64IZCMP-SR-NEXT: sw t0, %lo(var0+8)(a0) -; RV64IZCMP-SR-NEXT: sw a7, %lo(var0+4)(a0) +; RV64IZCMP-SR-NEXT: addi a2, a0, %lo(var0) +; RV64IZCMP-SR-NEXT: lw a7, 16(a2) +; RV64IZCMP-SR-NEXT: lw t0, 20(a2) +; RV64IZCMP-SR-NEXT: lw t1, 24(a2) +; RV64IZCMP-SR-NEXT: lw t2, 28(a2) +; RV64IZCMP-SR-NEXT: lw t3, 32(a2) +; RV64IZCMP-SR-NEXT: lw t4, 36(a2) +; RV64IZCMP-SR-NEXT: lw t5, 40(a2) +; RV64IZCMP-SR-NEXT: lw t6, 44(a2) +; RV64IZCMP-SR-NEXT: lw a3, 48(a2) +; RV64IZCMP-SR-NEXT: lw a4, 52(a2) +; RV64IZCMP-SR-NEXT: lw a5, 56(a2) +; RV64IZCMP-SR-NEXT: lw a1, 60(a2) +; RV64IZCMP-SR-NEXT: lw s0, 64(a2) +; RV64IZCMP-SR-NEXT: lw s1, 68(a2) +; RV64IZCMP-SR-NEXT: lw s2, %lo(var0+4)(a0) +; RV64IZCMP-SR-NEXT: lw s3, %lo(var0+8)(a0) +; RV64IZCMP-SR-NEXT: lw s4, %lo(var0+12)(a0) +; RV64IZCMP-SR-NEXT: sw s1, 68(a2) +; RV64IZCMP-SR-NEXT: sw s0, 64(a2) +; RV64IZCMP-SR-NEXT: sw a1, 60(a2) +; RV64IZCMP-SR-NEXT: sw a5, 56(a2) +; RV64IZCMP-SR-NEXT: sw a4, 52(a2) +; RV64IZCMP-SR-NEXT: sw a3, 48(a2) +; RV64IZCMP-SR-NEXT: sw t6, 44(a2) +; RV64IZCMP-SR-NEXT: sw t5, 40(a2) +; RV64IZCMP-SR-NEXT: sw t4, 36(a2) +; RV64IZCMP-SR-NEXT: sw t3, 32(a2) +; RV64IZCMP-SR-NEXT: sw t2, 28(a2) +; RV64IZCMP-SR-NEXT: sw t1, 24(a2) +; RV64IZCMP-SR-NEXT: sw t0, 20(a2) +; RV64IZCMP-SR-NEXT: sw a7, 16(a2) +; RV64IZCMP-SR-NEXT: sw s4, %lo(var0+12)(a0) +; RV64IZCMP-SR-NEXT: sw s3, %lo(var0+8)(a0) +; RV64IZCMP-SR-NEXT: sw s2, %lo(var0+4)(a0) ; RV64IZCMP-SR-NEXT: sw a6, %lo(var0)(a0) ; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s4}, 48 ; @@ -1286,41 +1286,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind { ; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: lui a0, %hi(var0) ; RV32I-NEXT: lw a1, %lo(var0)(a0) -; RV32I-NEXT: lw a2, %lo(var0+4)(a0) -; RV32I-NEXT: lw a3, %lo(var0+8)(a0) -; RV32I-NEXT: lw a4, %lo(var0+12)(a0) -; RV32I-NEXT: addi a5, a0, %lo(var0) -; RV32I-NEXT: lw a6, 16(a5) -; RV32I-NEXT: lw a7, 20(a5) -; RV32I-NEXT: lw t0, 24(a5) -; RV32I-NEXT: lw t1, 28(a5) -; RV32I-NEXT: lw t2, 32(a5) -; RV32I-NEXT: lw t3, 36(a5) -; RV32I-NEXT: lw t4, 40(a5) -; RV32I-NEXT: lw t5, 44(a5) -; RV32I-NEXT: lw t6, 48(a5) -; RV32I-NEXT: lw s0, 52(a5) -; RV32I-NEXT: lw s1, 68(a5) -; RV32I-NEXT: lw s2, 64(a5) -; RV32I-NEXT: lw s3, 60(a5) -; RV32I-NEXT: lw s4, 56(a5) -; RV32I-NEXT: sw s1, 68(a5) -; RV32I-NEXT: sw s2, 64(a5) -; RV32I-NEXT: sw s3, 60(a5) -; RV32I-NEXT: sw s4, 56(a5) -; RV32I-NEXT: sw s0, 52(a5) -; RV32I-NEXT: sw t6, 48(a5) -; RV32I-NEXT: sw t5, 44(a5) -; RV32I-NEXT: sw t4, 40(a5) -; RV32I-NEXT: sw t3, 36(a5) -; RV32I-NEXT: sw t2, 32(a5) -; RV32I-NEXT: sw t1, 28(a5) -; RV32I-NEXT: sw t0, 24(a5) -; RV32I-NEXT: sw a7, 20(a5) -; RV32I-NEXT: sw a6, 16(a5) -; RV32I-NEXT: sw a4, %lo(var0+12)(a0) -; RV32I-NEXT: 
sw a3, %lo(var0+8)(a0) -; RV32I-NEXT: sw a2, %lo(var0+4)(a0) +; RV32I-NEXT: addi a2, a0, %lo(var0) +; RV32I-NEXT: lw a3, 16(a2) +; RV32I-NEXT: lw a4, 20(a2) +; RV32I-NEXT: lw a5, 24(a2) +; RV32I-NEXT: lw a6, 28(a2) +; RV32I-NEXT: lw a7, 32(a2) +; RV32I-NEXT: lw t0, 36(a2) +; RV32I-NEXT: lw t1, 40(a2) +; RV32I-NEXT: lw t2, 44(a2) +; RV32I-NEXT: lw t3, 48(a2) +; RV32I-NEXT: lw t4, 52(a2) +; RV32I-NEXT: lw t5, 56(a2) +; RV32I-NEXT: lw t6, 60(a2) +; RV32I-NEXT: lw s0, 64(a2) +; RV32I-NEXT: lw s1, 68(a2) +; RV32I-NEXT: lw s2, %lo(var0+4)(a0) +; RV32I-NEXT: lw s3, %lo(var0+8)(a0) +; RV32I-NEXT: lw s4, %lo(var0+12)(a0) +; RV32I-NEXT: sw s1, 68(a2) +; RV32I-NEXT: sw s0, 64(a2) +; RV32I-NEXT: sw t6, 60(a2) +; RV32I-NEXT: sw t5, 56(a2) +; RV32I-NEXT: sw t4, 52(a2) +; RV32I-NEXT: sw t3, 48(a2) +; RV32I-NEXT: sw t2, 44(a2) +; RV32I-NEXT: sw t1, 40(a2) +; RV32I-NEXT: sw t0, 36(a2) +; RV32I-NEXT: sw a7, 32(a2) +; RV32I-NEXT: sw a6, 28(a2) +; RV32I-NEXT: sw a5, 24(a2) +; RV32I-NEXT: sw a4, 20(a2) +; RV32I-NEXT: sw a3, 16(a2) +; RV32I-NEXT: sw s4, %lo(var0+12)(a0) +; RV32I-NEXT: sw s3, %lo(var0+8)(a0) +; RV32I-NEXT: sw s2, %lo(var0+4)(a0) ; RV32I-NEXT: sw a1, %lo(var0)(a0) ; RV32I-NEXT: lw s0, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 24(sp) # 4-byte Folded Reload @@ -1340,41 +1340,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind { ; RV64I-NEXT: sd s4, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lui a0, %hi(var0) ; RV64I-NEXT: lw a1, %lo(var0)(a0) -; RV64I-NEXT: lw a2, %lo(var0+4)(a0) -; RV64I-NEXT: lw a3, %lo(var0+8)(a0) -; RV64I-NEXT: lw a4, %lo(var0+12)(a0) -; RV64I-NEXT: addi a5, a0, %lo(var0) -; RV64I-NEXT: lw a6, 16(a5) -; RV64I-NEXT: lw a7, 20(a5) -; RV64I-NEXT: lw t0, 24(a5) -; RV64I-NEXT: lw t1, 28(a5) -; RV64I-NEXT: lw t2, 32(a5) -; RV64I-NEXT: lw t3, 36(a5) -; RV64I-NEXT: lw t4, 40(a5) -; RV64I-NEXT: lw t5, 44(a5) -; RV64I-NEXT: lw t6, 48(a5) -; RV64I-NEXT: lw s0, 52(a5) -; RV64I-NEXT: lw s1, 68(a5) -; RV64I-NEXT: lw s2, 64(a5) -; RV64I-NEXT: lw s3, 60(a5) -; RV64I-NEXT: lw s4, 56(a5) -; RV64I-NEXT: sw s1, 68(a5) -; RV64I-NEXT: sw s2, 64(a5) -; RV64I-NEXT: sw s3, 60(a5) -; RV64I-NEXT: sw s4, 56(a5) -; RV64I-NEXT: sw s0, 52(a5) -; RV64I-NEXT: sw t6, 48(a5) -; RV64I-NEXT: sw t5, 44(a5) -; RV64I-NEXT: sw t4, 40(a5) -; RV64I-NEXT: sw t3, 36(a5) -; RV64I-NEXT: sw t2, 32(a5) -; RV64I-NEXT: sw t1, 28(a5) -; RV64I-NEXT: sw t0, 24(a5) -; RV64I-NEXT: sw a7, 20(a5) -; RV64I-NEXT: sw a6, 16(a5) -; RV64I-NEXT: sw a4, %lo(var0+12)(a0) -; RV64I-NEXT: sw a3, %lo(var0+8)(a0) -; RV64I-NEXT: sw a2, %lo(var0+4)(a0) +; RV64I-NEXT: addi a2, a0, %lo(var0) +; RV64I-NEXT: lw a3, 16(a2) +; RV64I-NEXT: lw a4, 20(a2) +; RV64I-NEXT: lw a5, 24(a2) +; RV64I-NEXT: lw a6, 28(a2) +; RV64I-NEXT: lw a7, 32(a2) +; RV64I-NEXT: lw t0, 36(a2) +; RV64I-NEXT: lw t1, 40(a2) +; RV64I-NEXT: lw t2, 44(a2) +; RV64I-NEXT: lw t3, 48(a2) +; RV64I-NEXT: lw t4, 52(a2) +; RV64I-NEXT: lw t5, 56(a2) +; RV64I-NEXT: lw t6, 60(a2) +; RV64I-NEXT: lw s0, 64(a2) +; RV64I-NEXT: lw s1, 68(a2) +; RV64I-NEXT: lw s2, %lo(var0+4)(a0) +; RV64I-NEXT: lw s3, %lo(var0+8)(a0) +; RV64I-NEXT: lw s4, %lo(var0+12)(a0) +; RV64I-NEXT: sw s1, 68(a2) +; RV64I-NEXT: sw s0, 64(a2) +; RV64I-NEXT: sw t6, 60(a2) +; RV64I-NEXT: sw t5, 56(a2) +; RV64I-NEXT: sw t4, 52(a2) +; RV64I-NEXT: sw t3, 48(a2) +; RV64I-NEXT: sw t2, 44(a2) +; RV64I-NEXT: sw t1, 40(a2) +; RV64I-NEXT: sw t0, 36(a2) +; RV64I-NEXT: sw a7, 32(a2) +; RV64I-NEXT: sw a6, 28(a2) +; RV64I-NEXT: sw a5, 24(a2) +; RV64I-NEXT: sw a4, 20(a2) +; RV64I-NEXT: sw a3, 16(a2) +; 
RV64I-NEXT: sw s4, %lo(var0+12)(a0) +; RV64I-NEXT: sw s3, %lo(var0+8)(a0) +; RV64I-NEXT: sw s2, %lo(var0+4)(a0) ; RV64I-NEXT: sw a1, %lo(var0)(a0) ; RV64I-NEXT: ld s0, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 32(sp) # 8-byte Folded Reload @@ -1813,84 +1813,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: sw t4, 44(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: sw t5, 40(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: sw t6, 36(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lui a6, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-NEXT: lui a5, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a5) ; RV32IZCMP-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-NEXT: addi a2, a5, %lo(var_test_irq) +; RV32IZCMP-NEXT: lw a0, 16(a2) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-NEXT: lw a0, 20(a2) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-NEXT: lw a0, 24(a2) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) -; RV32IZCMP-NEXT: lw a0, 16(a5) +; RV32IZCMP-NEXT: lw a0, 28(a2) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, 20(a5) +; RV32IZCMP-NEXT: lw a0, 32(a2) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw t4, 24(a5) -; RV32IZCMP-NEXT: lw t5, 28(a5) -; RV32IZCMP-NEXT: lw t6, 32(a5) -; RV32IZCMP-NEXT: lw s2, 36(a5) -; RV32IZCMP-NEXT: lw s3, 40(a5) -; RV32IZCMP-NEXT: lw s4, 44(a5) -; RV32IZCMP-NEXT: lw s5, 48(a5) -; RV32IZCMP-NEXT: lw s6, 52(a5) -; RV32IZCMP-NEXT: lw s7, 56(a5) -; RV32IZCMP-NEXT: lw s8, 60(a5) -; RV32IZCMP-NEXT: lw s9, 64(a5) -; RV32IZCMP-NEXT: lw s10, 68(a5) -; RV32IZCMP-NEXT: lw s11, 72(a5) -; RV32IZCMP-NEXT: lw ra, 76(a5) -; RV32IZCMP-NEXT: lw s1, 80(a5) -; RV32IZCMP-NEXT: lw t3, 84(a5) -; RV32IZCMP-NEXT: lw t2, 88(a5) -; RV32IZCMP-NEXT: lw t1, 92(a5) -; RV32IZCMP-NEXT: lw t0, 96(a5) -; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a7, 104(a5) -; RV32IZCMP-NEXT: lw a4, 108(a5) -; RV32IZCMP-NEXT: lw a0, 124(a5) -; RV32IZCMP-NEXT: lw a1, 120(a5) -; RV32IZCMP-NEXT: lw a2, 116(a5) -; RV32IZCMP-NEXT: lw a3, 112(a5) -; RV32IZCMP-NEXT: sw a0, 124(a5) -; RV32IZCMP-NEXT: sw a1, 120(a5) -; RV32IZCMP-NEXT: sw a2, 116(a5) -; RV32IZCMP-NEXT: sw a3, 112(a5) -; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a7, 104(a5) -; RV32IZCMP-NEXT: sw s0, 100(a5) -; RV32IZCMP-NEXT: sw t0, 96(a5) -; RV32IZCMP-NEXT: sw t1, 92(a5) -; RV32IZCMP-NEXT: sw t2, 88(a5) -; RV32IZCMP-NEXT: sw t3, 84(a5) -; RV32IZCMP-NEXT: sw s1, 80(a5) -; RV32IZCMP-NEXT: sw ra, 76(a5) -; RV32IZCMP-NEXT: sw s11, 72(a5) -; RV32IZCMP-NEXT: sw s10, 68(a5) -; RV32IZCMP-NEXT: sw s9, 64(a5) -; RV32IZCMP-NEXT: sw s8, 60(a5) -; RV32IZCMP-NEXT: sw s7, 56(a5) -; RV32IZCMP-NEXT: sw s6, 52(a5) -; RV32IZCMP-NEXT: sw s5, 48(a5) -; RV32IZCMP-NEXT: sw s4, 44(a5) -; RV32IZCMP-NEXT: sw s3, 40(a5) -; RV32IZCMP-NEXT: sw s2, 36(a5) -; RV32IZCMP-NEXT: sw t6, 32(a5) -; RV32IZCMP-NEXT: sw t5, 28(a5) -; RV32IZCMP-NEXT: sw t4, 24(a5) +; RV32IZCMP-NEXT: lw t4, 36(a2) +; RV32IZCMP-NEXT: lw t5, 40(a2) +; RV32IZCMP-NEXT: lw t6, 44(a2) +; RV32IZCMP-NEXT: lw s2, 48(a2) +; RV32IZCMP-NEXT: lw s3, 52(a2) +; RV32IZCMP-NEXT: lw s4, 56(a2) +; RV32IZCMP-NEXT: lw s5, 60(a2) +; RV32IZCMP-NEXT: lw s6, 64(a2) +; RV32IZCMP-NEXT: lw s7, 68(a2) +; RV32IZCMP-NEXT: lw s8, 72(a2) +; 
RV32IZCMP-NEXT: lw s9, 76(a2) +; RV32IZCMP-NEXT: lw s10, 80(a2) +; RV32IZCMP-NEXT: lw s11, 84(a2) +; RV32IZCMP-NEXT: lw ra, 88(a2) +; RV32IZCMP-NEXT: lw s1, 92(a2) +; RV32IZCMP-NEXT: lw t0, 96(a2) +; RV32IZCMP-NEXT: lw a7, 100(a2) +; RV32IZCMP-NEXT: lw a6, 104(a2) +; RV32IZCMP-NEXT: lw a4, 108(a2) +; RV32IZCMP-NEXT: lw s0, 112(a2) +; RV32IZCMP-NEXT: lw a3, 116(a2) +; RV32IZCMP-NEXT: lw a1, 120(a2) +; RV32IZCMP-NEXT: lw a0, 124(a2) +; RV32IZCMP-NEXT: lw t3, %lo(var_test_irq+4)(a5) +; RV32IZCMP-NEXT: lw t2, %lo(var_test_irq+8)(a5) +; RV32IZCMP-NEXT: lw t1, %lo(var_test_irq+12)(a5) +; RV32IZCMP-NEXT: sw a0, 124(a2) +; RV32IZCMP-NEXT: sw a1, 120(a2) +; RV32IZCMP-NEXT: sw a3, 116(a2) +; RV32IZCMP-NEXT: sw s0, 112(a2) +; RV32IZCMP-NEXT: sw a4, 108(a2) +; RV32IZCMP-NEXT: sw a6, 104(a2) +; RV32IZCMP-NEXT: sw a7, 100(a2) +; RV32IZCMP-NEXT: sw t0, 96(a2) +; RV32IZCMP-NEXT: sw s1, 92(a2) +; RV32IZCMP-NEXT: sw ra, 88(a2) +; RV32IZCMP-NEXT: sw s11, 84(a2) +; RV32IZCMP-NEXT: sw s10, 80(a2) +; RV32IZCMP-NEXT: sw s9, 76(a2) +; RV32IZCMP-NEXT: sw s8, 72(a2) +; RV32IZCMP-NEXT: sw s7, 68(a2) +; RV32IZCMP-NEXT: sw s6, 64(a2) +; RV32IZCMP-NEXT: sw s5, 60(a2) +; RV32IZCMP-NEXT: sw s4, 56(a2) +; RV32IZCMP-NEXT: sw s3, 52(a2) +; RV32IZCMP-NEXT: sw s2, 48(a2) +; RV32IZCMP-NEXT: sw t6, 44(a2) +; RV32IZCMP-NEXT: sw t5, 40(a2) +; RV32IZCMP-NEXT: sw t4, 36(a2) ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, 20(a5) +; RV32IZCMP-NEXT: sw a0, 32(a2) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, 16(a5) +; RV32IZCMP-NEXT: sw a0, 28(a2) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-NEXT: sw a0, 24(a2) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-NEXT: sw a0, 20(a2) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-NEXT: sw a0, 16(a2) +; RV32IZCMP-NEXT: sw t1, %lo(var_test_irq+12)(a5) +; RV32IZCMP-NEXT: sw t2, %lo(var_test_irq+8)(a5) +; RV32IZCMP-NEXT: sw t3, %lo(var_test_irq+4)(a5) ; RV32IZCMP-NEXT: lw a0, 32(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a5) ; RV32IZCMP-NEXT: lw t0, 92(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t1, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t2, 84(sp) # 4-byte Folded Reload @@ -1929,84 +1929,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: sd t4, 72(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: sd t5, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: sd t6, 56(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lui a6, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-NEXT: lui a5, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a5) ; RV64IZCMP-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-NEXT: addi a2, a5, %lo(var_test_irq) +; RV64IZCMP-NEXT: lw a0, 16(a2) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-NEXT: lw a0, 20(a2) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-NEXT: lw a0, 24(a2) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) -; RV64IZCMP-NEXT: lw a0, 16(a5) +; RV64IZCMP-NEXT: lw a0, 28(a2) ; 
RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, 20(a5) +; RV64IZCMP-NEXT: lw a0, 32(a2) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw t4, 24(a5) -; RV64IZCMP-NEXT: lw t5, 28(a5) -; RV64IZCMP-NEXT: lw t6, 32(a5) -; RV64IZCMP-NEXT: lw s2, 36(a5) -; RV64IZCMP-NEXT: lw s3, 40(a5) -; RV64IZCMP-NEXT: lw s4, 44(a5) -; RV64IZCMP-NEXT: lw s5, 48(a5) -; RV64IZCMP-NEXT: lw s6, 52(a5) -; RV64IZCMP-NEXT: lw s7, 56(a5) -; RV64IZCMP-NEXT: lw s8, 60(a5) -; RV64IZCMP-NEXT: lw s9, 64(a5) -; RV64IZCMP-NEXT: lw s10, 68(a5) -; RV64IZCMP-NEXT: lw s11, 72(a5) -; RV64IZCMP-NEXT: lw ra, 76(a5) -; RV64IZCMP-NEXT: lw s1, 80(a5) -; RV64IZCMP-NEXT: lw t3, 84(a5) -; RV64IZCMP-NEXT: lw t2, 88(a5) -; RV64IZCMP-NEXT: lw t1, 92(a5) -; RV64IZCMP-NEXT: lw t0, 96(a5) -; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a7, 104(a5) -; RV64IZCMP-NEXT: lw a4, 108(a5) -; RV64IZCMP-NEXT: lw a0, 124(a5) -; RV64IZCMP-NEXT: lw a1, 120(a5) -; RV64IZCMP-NEXT: lw a2, 116(a5) -; RV64IZCMP-NEXT: lw a3, 112(a5) -; RV64IZCMP-NEXT: sw a0, 124(a5) -; RV64IZCMP-NEXT: sw a1, 120(a5) -; RV64IZCMP-NEXT: sw a2, 116(a5) -; RV64IZCMP-NEXT: sw a3, 112(a5) -; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a7, 104(a5) -; RV64IZCMP-NEXT: sw s0, 100(a5) -; RV64IZCMP-NEXT: sw t0, 96(a5) -; RV64IZCMP-NEXT: sw t1, 92(a5) -; RV64IZCMP-NEXT: sw t2, 88(a5) -; RV64IZCMP-NEXT: sw t3, 84(a5) -; RV64IZCMP-NEXT: sw s1, 80(a5) -; RV64IZCMP-NEXT: sw ra, 76(a5) -; RV64IZCMP-NEXT: sw s11, 72(a5) -; RV64IZCMP-NEXT: sw s10, 68(a5) -; RV64IZCMP-NEXT: sw s9, 64(a5) -; RV64IZCMP-NEXT: sw s8, 60(a5) -; RV64IZCMP-NEXT: sw s7, 56(a5) -; RV64IZCMP-NEXT: sw s6, 52(a5) -; RV64IZCMP-NEXT: sw s5, 48(a5) -; RV64IZCMP-NEXT: sw s4, 44(a5) -; RV64IZCMP-NEXT: sw s3, 40(a5) -; RV64IZCMP-NEXT: sw s2, 36(a5) -; RV64IZCMP-NEXT: sw t6, 32(a5) -; RV64IZCMP-NEXT: sw t5, 28(a5) -; RV64IZCMP-NEXT: sw t4, 24(a5) +; RV64IZCMP-NEXT: lw t4, 36(a2) +; RV64IZCMP-NEXT: lw t5, 40(a2) +; RV64IZCMP-NEXT: lw t6, 44(a2) +; RV64IZCMP-NEXT: lw s2, 48(a2) +; RV64IZCMP-NEXT: lw s3, 52(a2) +; RV64IZCMP-NEXT: lw s4, 56(a2) +; RV64IZCMP-NEXT: lw s5, 60(a2) +; RV64IZCMP-NEXT: lw s6, 64(a2) +; RV64IZCMP-NEXT: lw s7, 68(a2) +; RV64IZCMP-NEXT: lw s8, 72(a2) +; RV64IZCMP-NEXT: lw s9, 76(a2) +; RV64IZCMP-NEXT: lw s10, 80(a2) +; RV64IZCMP-NEXT: lw s11, 84(a2) +; RV64IZCMP-NEXT: lw ra, 88(a2) +; RV64IZCMP-NEXT: lw s1, 92(a2) +; RV64IZCMP-NEXT: lw t0, 96(a2) +; RV64IZCMP-NEXT: lw a7, 100(a2) +; RV64IZCMP-NEXT: lw a6, 104(a2) +; RV64IZCMP-NEXT: lw a4, 108(a2) +; RV64IZCMP-NEXT: lw s0, 112(a2) +; RV64IZCMP-NEXT: lw a3, 116(a2) +; RV64IZCMP-NEXT: lw a1, 120(a2) +; RV64IZCMP-NEXT: lw a0, 124(a2) +; RV64IZCMP-NEXT: lw t3, %lo(var_test_irq+4)(a5) +; RV64IZCMP-NEXT: lw t2, %lo(var_test_irq+8)(a5) +; RV64IZCMP-NEXT: lw t1, %lo(var_test_irq+12)(a5) +; RV64IZCMP-NEXT: sw a0, 124(a2) +; RV64IZCMP-NEXT: sw a1, 120(a2) +; RV64IZCMP-NEXT: sw a3, 116(a2) +; RV64IZCMP-NEXT: sw s0, 112(a2) +; RV64IZCMP-NEXT: sw a4, 108(a2) +; RV64IZCMP-NEXT: sw a6, 104(a2) +; RV64IZCMP-NEXT: sw a7, 100(a2) +; RV64IZCMP-NEXT: sw t0, 96(a2) +; RV64IZCMP-NEXT: sw s1, 92(a2) +; RV64IZCMP-NEXT: sw ra, 88(a2) +; RV64IZCMP-NEXT: sw s11, 84(a2) +; RV64IZCMP-NEXT: sw s10, 80(a2) +; RV64IZCMP-NEXT: sw s9, 76(a2) +; RV64IZCMP-NEXT: sw s8, 72(a2) +; RV64IZCMP-NEXT: sw s7, 68(a2) +; RV64IZCMP-NEXT: sw s6, 64(a2) +; RV64IZCMP-NEXT: sw s5, 60(a2) +; RV64IZCMP-NEXT: sw s4, 56(a2) +; RV64IZCMP-NEXT: sw s3, 52(a2) +; RV64IZCMP-NEXT: sw s2, 48(a2) +; RV64IZCMP-NEXT: sw t6, 44(a2) +; RV64IZCMP-NEXT: sw t5, 
40(a2) +; RV64IZCMP-NEXT: sw t4, 36(a2) ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, 20(a5) +; RV64IZCMP-NEXT: sw a0, 32(a2) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, 16(a5) +; RV64IZCMP-NEXT: sw a0, 28(a2) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-NEXT: sw a0, 24(a2) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-NEXT: sw a0, 20(a2) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-NEXT: sw a0, 16(a2) +; RV64IZCMP-NEXT: sw t1, %lo(var_test_irq+12)(a5) +; RV64IZCMP-NEXT: sw t2, %lo(var_test_irq+8)(a5) +; RV64IZCMP-NEXT: sw t3, %lo(var_test_irq+4)(a5) ; RV64IZCMP-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a5) ; RV64IZCMP-NEXT: ld t0, 168(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t1, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t2, 152(sp) # 8-byte Folded Reload @@ -2045,84 +2045,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: sw t4, 44(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: sw t5, 40(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: sw t6, 36(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-SR-NEXT: lui a5, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a5) ; RV32IZCMP-SR-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-SR-NEXT: addi a2, a5, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, 16(a2) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-SR-NEXT: lw a0, 20(a2) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-SR-NEXT: lw a0, 24(a2) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, 16(a5) +; RV32IZCMP-SR-NEXT: lw a0, 28(a2) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, 20(a5) +; RV32IZCMP-SR-NEXT: lw a0, 32(a2) ; RV32IZCMP-SR-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw t4, 24(a5) -; RV32IZCMP-SR-NEXT: lw t5, 28(a5) -; RV32IZCMP-SR-NEXT: lw t6, 32(a5) -; RV32IZCMP-SR-NEXT: lw s2, 36(a5) -; RV32IZCMP-SR-NEXT: lw s3, 40(a5) -; RV32IZCMP-SR-NEXT: lw s4, 44(a5) -; RV32IZCMP-SR-NEXT: lw s5, 48(a5) -; RV32IZCMP-SR-NEXT: lw s6, 52(a5) -; RV32IZCMP-SR-NEXT: lw s7, 56(a5) -; RV32IZCMP-SR-NEXT: lw s8, 60(a5) -; RV32IZCMP-SR-NEXT: lw s9, 64(a5) -; RV32IZCMP-SR-NEXT: lw s10, 68(a5) -; RV32IZCMP-SR-NEXT: lw s11, 72(a5) -; RV32IZCMP-SR-NEXT: lw ra, 76(a5) -; RV32IZCMP-SR-NEXT: lw s1, 80(a5) -; RV32IZCMP-SR-NEXT: lw t3, 84(a5) -; RV32IZCMP-SR-NEXT: lw t2, 88(a5) -; RV32IZCMP-SR-NEXT: lw t1, 92(a5) -; RV32IZCMP-SR-NEXT: lw t0, 96(a5) -; RV32IZCMP-SR-NEXT: lw s0, 100(a5) -; RV32IZCMP-SR-NEXT: lw a7, 104(a5) -; RV32IZCMP-SR-NEXT: lw a4, 108(a5) -; RV32IZCMP-SR-NEXT: lw a0, 124(a5) -; RV32IZCMP-SR-NEXT: lw a1, 120(a5) -; RV32IZCMP-SR-NEXT: lw a2, 116(a5) -; RV32IZCMP-SR-NEXT: lw a3, 112(a5) -; RV32IZCMP-SR-NEXT: sw a0, 124(a5) -; RV32IZCMP-SR-NEXT: sw a1, 120(a5) -; RV32IZCMP-SR-NEXT: sw a2, 116(a5) -; 
RV32IZCMP-SR-NEXT: sw a3, 112(a5) -; RV32IZCMP-SR-NEXT: sw a4, 108(a5) -; RV32IZCMP-SR-NEXT: sw a7, 104(a5) -; RV32IZCMP-SR-NEXT: sw s0, 100(a5) -; RV32IZCMP-SR-NEXT: sw t0, 96(a5) -; RV32IZCMP-SR-NEXT: sw t1, 92(a5) -; RV32IZCMP-SR-NEXT: sw t2, 88(a5) -; RV32IZCMP-SR-NEXT: sw t3, 84(a5) -; RV32IZCMP-SR-NEXT: sw s1, 80(a5) -; RV32IZCMP-SR-NEXT: sw ra, 76(a5) -; RV32IZCMP-SR-NEXT: sw s11, 72(a5) -; RV32IZCMP-SR-NEXT: sw s10, 68(a5) -; RV32IZCMP-SR-NEXT: sw s9, 64(a5) -; RV32IZCMP-SR-NEXT: sw s8, 60(a5) -; RV32IZCMP-SR-NEXT: sw s7, 56(a5) -; RV32IZCMP-SR-NEXT: sw s6, 52(a5) -; RV32IZCMP-SR-NEXT: sw s5, 48(a5) -; RV32IZCMP-SR-NEXT: sw s4, 44(a5) -; RV32IZCMP-SR-NEXT: sw s3, 40(a5) -; RV32IZCMP-SR-NEXT: sw s2, 36(a5) -; RV32IZCMP-SR-NEXT: sw t6, 32(a5) -; RV32IZCMP-SR-NEXT: sw t5, 28(a5) -; RV32IZCMP-SR-NEXT: sw t4, 24(a5) +; RV32IZCMP-SR-NEXT: lw t4, 36(a2) +; RV32IZCMP-SR-NEXT: lw t5, 40(a2) +; RV32IZCMP-SR-NEXT: lw t6, 44(a2) +; RV32IZCMP-SR-NEXT: lw s2, 48(a2) +; RV32IZCMP-SR-NEXT: lw s3, 52(a2) +; RV32IZCMP-SR-NEXT: lw s4, 56(a2) +; RV32IZCMP-SR-NEXT: lw s5, 60(a2) +; RV32IZCMP-SR-NEXT: lw s6, 64(a2) +; RV32IZCMP-SR-NEXT: lw s7, 68(a2) +; RV32IZCMP-SR-NEXT: lw s8, 72(a2) +; RV32IZCMP-SR-NEXT: lw s9, 76(a2) +; RV32IZCMP-SR-NEXT: lw s10, 80(a2) +; RV32IZCMP-SR-NEXT: lw s11, 84(a2) +; RV32IZCMP-SR-NEXT: lw ra, 88(a2) +; RV32IZCMP-SR-NEXT: lw s1, 92(a2) +; RV32IZCMP-SR-NEXT: lw t0, 96(a2) +; RV32IZCMP-SR-NEXT: lw a7, 100(a2) +; RV32IZCMP-SR-NEXT: lw a6, 104(a2) +; RV32IZCMP-SR-NEXT: lw a4, 108(a2) +; RV32IZCMP-SR-NEXT: lw s0, 112(a2) +; RV32IZCMP-SR-NEXT: lw a3, 116(a2) +; RV32IZCMP-SR-NEXT: lw a1, 120(a2) +; RV32IZCMP-SR-NEXT: lw a0, 124(a2) +; RV32IZCMP-SR-NEXT: lw t3, %lo(var_test_irq+4)(a5) +; RV32IZCMP-SR-NEXT: lw t2, %lo(var_test_irq+8)(a5) +; RV32IZCMP-SR-NEXT: lw t1, %lo(var_test_irq+12)(a5) +; RV32IZCMP-SR-NEXT: sw a0, 124(a2) +; RV32IZCMP-SR-NEXT: sw a1, 120(a2) +; RV32IZCMP-SR-NEXT: sw a3, 116(a2) +; RV32IZCMP-SR-NEXT: sw s0, 112(a2) +; RV32IZCMP-SR-NEXT: sw a4, 108(a2) +; RV32IZCMP-SR-NEXT: sw a6, 104(a2) +; RV32IZCMP-SR-NEXT: sw a7, 100(a2) +; RV32IZCMP-SR-NEXT: sw t0, 96(a2) +; RV32IZCMP-SR-NEXT: sw s1, 92(a2) +; RV32IZCMP-SR-NEXT: sw ra, 88(a2) +; RV32IZCMP-SR-NEXT: sw s11, 84(a2) +; RV32IZCMP-SR-NEXT: sw s10, 80(a2) +; RV32IZCMP-SR-NEXT: sw s9, 76(a2) +; RV32IZCMP-SR-NEXT: sw s8, 72(a2) +; RV32IZCMP-SR-NEXT: sw s7, 68(a2) +; RV32IZCMP-SR-NEXT: sw s6, 64(a2) +; RV32IZCMP-SR-NEXT: sw s5, 60(a2) +; RV32IZCMP-SR-NEXT: sw s4, 56(a2) +; RV32IZCMP-SR-NEXT: sw s3, 52(a2) +; RV32IZCMP-SR-NEXT: sw s2, 48(a2) +; RV32IZCMP-SR-NEXT: sw t6, 44(a2) +; RV32IZCMP-SR-NEXT: sw t5, 40(a2) +; RV32IZCMP-SR-NEXT: sw t4, 36(a2) ; RV32IZCMP-SR-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, 20(a5) +; RV32IZCMP-SR-NEXT: sw a0, 32(a2) ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, 16(a5) +; RV32IZCMP-SR-NEXT: sw a0, 28(a2) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-SR-NEXT: sw a0, 24(a2) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-SR-NEXT: sw a0, 20(a2) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-SR-NEXT: sw a0, 16(a2) +; RV32IZCMP-SR-NEXT: sw t1, %lo(var_test_irq+12)(a5) +; RV32IZCMP-SR-NEXT: sw t2, %lo(var_test_irq+8)(a5) +; RV32IZCMP-SR-NEXT: sw t3, %lo(var_test_irq+4)(a5) ; RV32IZCMP-SR-NEXT: lw a0, 
32(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a5) ; RV32IZCMP-SR-NEXT: lw t0, 92(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t1, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t2, 84(sp) # 4-byte Folded Reload @@ -2161,84 +2161,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: sd t4, 72(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: sd t5, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: sd t6, 56(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-SR-NEXT: lui a5, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a5) ; RV64IZCMP-SR-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-SR-NEXT: addi a2, a5, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, 16(a2) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-SR-NEXT: lw a0, 20(a2) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-SR-NEXT: lw a0, 24(a2) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, 16(a5) +; RV64IZCMP-SR-NEXT: lw a0, 28(a2) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, 20(a5) +; RV64IZCMP-SR-NEXT: lw a0, 32(a2) ; RV64IZCMP-SR-NEXT: sd a0, 8(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw t4, 24(a5) -; RV64IZCMP-SR-NEXT: lw t5, 28(a5) -; RV64IZCMP-SR-NEXT: lw t6, 32(a5) -; RV64IZCMP-SR-NEXT: lw s2, 36(a5) -; RV64IZCMP-SR-NEXT: lw s3, 40(a5) -; RV64IZCMP-SR-NEXT: lw s4, 44(a5) -; RV64IZCMP-SR-NEXT: lw s5, 48(a5) -; RV64IZCMP-SR-NEXT: lw s6, 52(a5) -; RV64IZCMP-SR-NEXT: lw s7, 56(a5) -; RV64IZCMP-SR-NEXT: lw s8, 60(a5) -; RV64IZCMP-SR-NEXT: lw s9, 64(a5) -; RV64IZCMP-SR-NEXT: lw s10, 68(a5) -; RV64IZCMP-SR-NEXT: lw s11, 72(a5) -; RV64IZCMP-SR-NEXT: lw ra, 76(a5) -; RV64IZCMP-SR-NEXT: lw s1, 80(a5) -; RV64IZCMP-SR-NEXT: lw t3, 84(a5) -; RV64IZCMP-SR-NEXT: lw t2, 88(a5) -; RV64IZCMP-SR-NEXT: lw t1, 92(a5) -; RV64IZCMP-SR-NEXT: lw t0, 96(a5) -; RV64IZCMP-SR-NEXT: lw s0, 100(a5) -; RV64IZCMP-SR-NEXT: lw a7, 104(a5) -; RV64IZCMP-SR-NEXT: lw a4, 108(a5) -; RV64IZCMP-SR-NEXT: lw a0, 124(a5) -; RV64IZCMP-SR-NEXT: lw a1, 120(a5) -; RV64IZCMP-SR-NEXT: lw a2, 116(a5) -; RV64IZCMP-SR-NEXT: lw a3, 112(a5) -; RV64IZCMP-SR-NEXT: sw a0, 124(a5) -; RV64IZCMP-SR-NEXT: sw a1, 120(a5) -; RV64IZCMP-SR-NEXT: sw a2, 116(a5) -; RV64IZCMP-SR-NEXT: sw a3, 112(a5) -; RV64IZCMP-SR-NEXT: sw a4, 108(a5) -; RV64IZCMP-SR-NEXT: sw a7, 104(a5) -; RV64IZCMP-SR-NEXT: sw s0, 100(a5) -; RV64IZCMP-SR-NEXT: sw t0, 96(a5) -; RV64IZCMP-SR-NEXT: sw t1, 92(a5) -; RV64IZCMP-SR-NEXT: sw t2, 88(a5) -; RV64IZCMP-SR-NEXT: sw t3, 84(a5) -; RV64IZCMP-SR-NEXT: sw s1, 80(a5) -; RV64IZCMP-SR-NEXT: sw ra, 76(a5) -; RV64IZCMP-SR-NEXT: sw s11, 72(a5) -; RV64IZCMP-SR-NEXT: sw s10, 68(a5) -; RV64IZCMP-SR-NEXT: sw s9, 64(a5) -; RV64IZCMP-SR-NEXT: sw s8, 60(a5) -; RV64IZCMP-SR-NEXT: sw s7, 56(a5) -; RV64IZCMP-SR-NEXT: sw s6, 52(a5) -; RV64IZCMP-SR-NEXT: sw s5, 48(a5) -; RV64IZCMP-SR-NEXT: sw s4, 44(a5) -; RV64IZCMP-SR-NEXT: sw s3, 40(a5) -; RV64IZCMP-SR-NEXT: sw s2, 36(a5) -; RV64IZCMP-SR-NEXT: sw t6, 32(a5) -; RV64IZCMP-SR-NEXT: sw t5, 28(a5) -; RV64IZCMP-SR-NEXT: sw t4, 24(a5) +; RV64IZCMP-SR-NEXT: lw t4, 36(a2) +; 
RV64IZCMP-SR-NEXT: lw t5, 40(a2) +; RV64IZCMP-SR-NEXT: lw t6, 44(a2) +; RV64IZCMP-SR-NEXT: lw s2, 48(a2) +; RV64IZCMP-SR-NEXT: lw s3, 52(a2) +; RV64IZCMP-SR-NEXT: lw s4, 56(a2) +; RV64IZCMP-SR-NEXT: lw s5, 60(a2) +; RV64IZCMP-SR-NEXT: lw s6, 64(a2) +; RV64IZCMP-SR-NEXT: lw s7, 68(a2) +; RV64IZCMP-SR-NEXT: lw s8, 72(a2) +; RV64IZCMP-SR-NEXT: lw s9, 76(a2) +; RV64IZCMP-SR-NEXT: lw s10, 80(a2) +; RV64IZCMP-SR-NEXT: lw s11, 84(a2) +; RV64IZCMP-SR-NEXT: lw ra, 88(a2) +; RV64IZCMP-SR-NEXT: lw s1, 92(a2) +; RV64IZCMP-SR-NEXT: lw t0, 96(a2) +; RV64IZCMP-SR-NEXT: lw a7, 100(a2) +; RV64IZCMP-SR-NEXT: lw a6, 104(a2) +; RV64IZCMP-SR-NEXT: lw a4, 108(a2) +; RV64IZCMP-SR-NEXT: lw s0, 112(a2) +; RV64IZCMP-SR-NEXT: lw a3, 116(a2) +; RV64IZCMP-SR-NEXT: lw a1, 120(a2) +; RV64IZCMP-SR-NEXT: lw a0, 124(a2) +; RV64IZCMP-SR-NEXT: lw t3, %lo(var_test_irq+4)(a5) +; RV64IZCMP-SR-NEXT: lw t2, %lo(var_test_irq+8)(a5) +; RV64IZCMP-SR-NEXT: lw t1, %lo(var_test_irq+12)(a5) +; RV64IZCMP-SR-NEXT: sw a0, 124(a2) +; RV64IZCMP-SR-NEXT: sw a1, 120(a2) +; RV64IZCMP-SR-NEXT: sw a3, 116(a2) +; RV64IZCMP-SR-NEXT: sw s0, 112(a2) +; RV64IZCMP-SR-NEXT: sw a4, 108(a2) +; RV64IZCMP-SR-NEXT: sw a6, 104(a2) +; RV64IZCMP-SR-NEXT: sw a7, 100(a2) +; RV64IZCMP-SR-NEXT: sw t0, 96(a2) +; RV64IZCMP-SR-NEXT: sw s1, 92(a2) +; RV64IZCMP-SR-NEXT: sw ra, 88(a2) +; RV64IZCMP-SR-NEXT: sw s11, 84(a2) +; RV64IZCMP-SR-NEXT: sw s10, 80(a2) +; RV64IZCMP-SR-NEXT: sw s9, 76(a2) +; RV64IZCMP-SR-NEXT: sw s8, 72(a2) +; RV64IZCMP-SR-NEXT: sw s7, 68(a2) +; RV64IZCMP-SR-NEXT: sw s6, 64(a2) +; RV64IZCMP-SR-NEXT: sw s5, 60(a2) +; RV64IZCMP-SR-NEXT: sw s4, 56(a2) +; RV64IZCMP-SR-NEXT: sw s3, 52(a2) +; RV64IZCMP-SR-NEXT: sw s2, 48(a2) +; RV64IZCMP-SR-NEXT: sw t6, 44(a2) +; RV64IZCMP-SR-NEXT: sw t5, 40(a2) +; RV64IZCMP-SR-NEXT: sw t4, 36(a2) ; RV64IZCMP-SR-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, 20(a5) +; RV64IZCMP-SR-NEXT: sw a0, 32(a2) ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, 16(a5) +; RV64IZCMP-SR-NEXT: sw a0, 28(a2) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-SR-NEXT: sw a0, 24(a2) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-SR-NEXT: sw a0, 20(a2) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-SR-NEXT: sw a0, 16(a2) +; RV64IZCMP-SR-NEXT: sw t1, %lo(var_test_irq+12)(a5) +; RV64IZCMP-SR-NEXT: sw t2, %lo(var_test_irq+8)(a5) +; RV64IZCMP-SR-NEXT: sw t3, %lo(var_test_irq+4)(a5) ; RV64IZCMP-SR-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a5) ; RV64IZCMP-SR-NEXT: ld t0, 168(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t1, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t2, 152(sp) # 8-byte Folded Reload @@ -2289,84 +2289,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: sw t4, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t5, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t6, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a6, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32I-NEXT: lui a4, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV32I-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32I-NEXT: addi a2, a4, %lo(var_test_irq) +; RV32I-NEXT: 
lw a0, 16(a2) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32I-NEXT: lw a0, 20(a2) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32I-NEXT: lw a0, 24(a2) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a6, %lo(var_test_irq) -; RV32I-NEXT: lw a0, 16(a5) +; RV32I-NEXT: lw a0, 28(a2) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, 20(a5) +; RV32I-NEXT: lw a0, 32(a2) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw t0, 24(a5) -; RV32I-NEXT: lw t1, 28(a5) -; RV32I-NEXT: lw t2, 32(a5) -; RV32I-NEXT: lw t3, 36(a5) -; RV32I-NEXT: lw t4, 40(a5) -; RV32I-NEXT: lw t5, 44(a5) -; RV32I-NEXT: lw t6, 48(a5) -; RV32I-NEXT: lw s0, 52(a5) -; RV32I-NEXT: lw s1, 56(a5) -; RV32I-NEXT: lw s2, 60(a5) -; RV32I-NEXT: lw s3, 64(a5) -; RV32I-NEXT: lw s4, 68(a5) -; RV32I-NEXT: lw s5, 72(a5) -; RV32I-NEXT: lw s6, 76(a5) -; RV32I-NEXT: lw s7, 80(a5) -; RV32I-NEXT: lw s8, 84(a5) -; RV32I-NEXT: lw s9, 88(a5) -; RV32I-NEXT: lw s10, 92(a5) -; RV32I-NEXT: lw s11, 96(a5) -; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a7, 104(a5) -; RV32I-NEXT: lw a4, 108(a5) -; RV32I-NEXT: lw a0, 124(a5) -; RV32I-NEXT: lw a1, 120(a5) -; RV32I-NEXT: lw a2, 116(a5) -; RV32I-NEXT: lw a3, 112(a5) -; RV32I-NEXT: sw a0, 124(a5) -; RV32I-NEXT: sw a1, 120(a5) -; RV32I-NEXT: sw a2, 116(a5) -; RV32I-NEXT: sw a3, 112(a5) -; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a7, 104(a5) -; RV32I-NEXT: sw ra, 100(a5) -; RV32I-NEXT: sw s11, 96(a5) -; RV32I-NEXT: sw s10, 92(a5) -; RV32I-NEXT: sw s9, 88(a5) -; RV32I-NEXT: sw s8, 84(a5) -; RV32I-NEXT: sw s7, 80(a5) -; RV32I-NEXT: sw s6, 76(a5) -; RV32I-NEXT: sw s5, 72(a5) -; RV32I-NEXT: sw s4, 68(a5) -; RV32I-NEXT: sw s3, 64(a5) -; RV32I-NEXT: sw s2, 60(a5) -; RV32I-NEXT: sw s1, 56(a5) -; RV32I-NEXT: sw s0, 52(a5) -; RV32I-NEXT: sw t6, 48(a5) -; RV32I-NEXT: sw t5, 44(a5) -; RV32I-NEXT: sw t4, 40(a5) -; RV32I-NEXT: sw t3, 36(a5) -; RV32I-NEXT: sw t2, 32(a5) -; RV32I-NEXT: sw t1, 28(a5) -; RV32I-NEXT: sw t0, 24(a5) +; RV32I-NEXT: lw t0, 36(a2) +; RV32I-NEXT: lw t1, 40(a2) +; RV32I-NEXT: lw t2, 44(a2) +; RV32I-NEXT: lw t3, 48(a2) +; RV32I-NEXT: lw t4, 52(a2) +; RV32I-NEXT: lw t5, 56(a2) +; RV32I-NEXT: lw t6, 60(a2) +; RV32I-NEXT: lw s0, 64(a2) +; RV32I-NEXT: lw s1, 68(a2) +; RV32I-NEXT: lw s2, 72(a2) +; RV32I-NEXT: lw s3, 76(a2) +; RV32I-NEXT: lw s4, 80(a2) +; RV32I-NEXT: lw s5, 84(a2) +; RV32I-NEXT: lw s6, 88(a2) +; RV32I-NEXT: lw s7, 92(a2) +; RV32I-NEXT: lw s8, 96(a2) +; RV32I-NEXT: lw s9, 100(a2) +; RV32I-NEXT: lw s10, 104(a2) +; RV32I-NEXT: lw s11, 108(a2) +; RV32I-NEXT: lw ra, 112(a2) +; RV32I-NEXT: lw a3, 116(a2) +; RV32I-NEXT: lw a1, 120(a2) +; RV32I-NEXT: lw a0, 124(a2) +; RV32I-NEXT: lw a7, %lo(var_test_irq+4)(a4) +; RV32I-NEXT: lw a6, %lo(var_test_irq+8)(a4) +; RV32I-NEXT: lw a5, %lo(var_test_irq+12)(a4) +; RV32I-NEXT: sw a0, 124(a2) +; RV32I-NEXT: sw a1, 120(a2) +; RV32I-NEXT: sw a3, 116(a2) +; RV32I-NEXT: sw ra, 112(a2) +; RV32I-NEXT: sw s11, 108(a2) +; RV32I-NEXT: sw s10, 104(a2) +; RV32I-NEXT: sw s9, 100(a2) +; RV32I-NEXT: sw s8, 96(a2) +; RV32I-NEXT: sw s7, 92(a2) +; RV32I-NEXT: sw s6, 88(a2) +; RV32I-NEXT: sw s5, 84(a2) +; RV32I-NEXT: sw s4, 80(a2) +; RV32I-NEXT: sw s3, 76(a2) +; RV32I-NEXT: sw s2, 72(a2) +; RV32I-NEXT: sw s1, 68(a2) +; RV32I-NEXT: sw s0, 64(a2) +; RV32I-NEXT: sw t6, 60(a2) +; RV32I-NEXT: sw t5, 56(a2) +; RV32I-NEXT: sw t4, 52(a2) +; RV32I-NEXT: sw t3, 48(a2) +; RV32I-NEXT: sw t2, 44(a2) +; RV32I-NEXT: sw 
t1, 40(a2) +; RV32I-NEXT: sw t0, 36(a2) ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, 20(a5) +; RV32I-NEXT: sw a0, 32(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, 16(a5) +; RV32I-NEXT: sw a0, 28(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32I-NEXT: sw a0, 24(a2) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32I-NEXT: sw a0, 20(a2) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32I-NEXT: sw a0, 16(a2) +; RV32I-NEXT: sw a5, %lo(var_test_irq+12)(a4) +; RV32I-NEXT: sw a6, %lo(var_test_irq+8)(a4) +; RV32I-NEXT: sw a7, %lo(var_test_irq+4)(a4) ; RV32I-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t1, 132(sp) # 4-byte Folded Reload @@ -2429,84 +2429,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: sd t4, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd t5, 56(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd t6, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a6, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64I-NEXT: lui a4, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64I-NEXT: addi a2, a4, %lo(var_test_irq) +; RV64I-NEXT: lw a0, 16(a2) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64I-NEXT: lw a0, 20(a2) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64I-NEXT: lw a0, 24(a2) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a6, %lo(var_test_irq) -; RV64I-NEXT: lw a0, 16(a5) +; RV64I-NEXT: lw a0, 28(a2) ; RV64I-NEXT: sd a0, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, 20(a5) +; RV64I-NEXT: lw a0, 32(a2) ; RV64I-NEXT: sd a0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw t0, 24(a5) -; RV64I-NEXT: lw t1, 28(a5) -; RV64I-NEXT: lw t2, 32(a5) -; RV64I-NEXT: lw t3, 36(a5) -; RV64I-NEXT: lw t4, 40(a5) -; RV64I-NEXT: lw t5, 44(a5) -; RV64I-NEXT: lw t6, 48(a5) -; RV64I-NEXT: lw s0, 52(a5) -; RV64I-NEXT: lw s1, 56(a5) -; RV64I-NEXT: lw s2, 60(a5) -; RV64I-NEXT: lw s3, 64(a5) -; RV64I-NEXT: lw s4, 68(a5) -; RV64I-NEXT: lw s5, 72(a5) -; RV64I-NEXT: lw s6, 76(a5) -; RV64I-NEXT: lw s7, 80(a5) -; RV64I-NEXT: lw s8, 84(a5) -; RV64I-NEXT: lw s9, 88(a5) -; RV64I-NEXT: lw s10, 92(a5) -; RV64I-NEXT: lw s11, 96(a5) -; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a7, 104(a5) -; RV64I-NEXT: lw a4, 108(a5) -; RV64I-NEXT: lw a0, 124(a5) -; RV64I-NEXT: lw a1, 120(a5) -; RV64I-NEXT: lw a2, 116(a5) -; RV64I-NEXT: lw a3, 112(a5) -; RV64I-NEXT: sw a0, 124(a5) -; RV64I-NEXT: sw a1, 120(a5) -; RV64I-NEXT: sw a2, 116(a5) -; RV64I-NEXT: sw a3, 112(a5) -; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a7, 104(a5) -; RV64I-NEXT: sw ra, 100(a5) -; RV64I-NEXT: sw s11, 96(a5) -; RV64I-NEXT: sw s10, 92(a5) -; RV64I-NEXT: sw s9, 88(a5) -; RV64I-NEXT: sw s8, 84(a5) -; RV64I-NEXT: sw s7, 80(a5) -; RV64I-NEXT: sw s6, 76(a5) -; RV64I-NEXT: sw s5, 72(a5) -; RV64I-NEXT: sw s4, 68(a5) -; RV64I-NEXT: sw s3, 64(a5) -; RV64I-NEXT: sw s2, 60(a5) -; RV64I-NEXT: sw s1, 56(a5) -; RV64I-NEXT: sw s0, 52(a5) -; RV64I-NEXT: sw t6, 
48(a5) -; RV64I-NEXT: sw t5, 44(a5) -; RV64I-NEXT: sw t4, 40(a5) -; RV64I-NEXT: sw t3, 36(a5) -; RV64I-NEXT: sw t2, 32(a5) -; RV64I-NEXT: sw t1, 28(a5) -; RV64I-NEXT: sw t0, 24(a5) +; RV64I-NEXT: lw t0, 36(a2) +; RV64I-NEXT: lw t1, 40(a2) +; RV64I-NEXT: lw t2, 44(a2) +; RV64I-NEXT: lw t3, 48(a2) +; RV64I-NEXT: lw t4, 52(a2) +; RV64I-NEXT: lw t5, 56(a2) +; RV64I-NEXT: lw t6, 60(a2) +; RV64I-NEXT: lw s0, 64(a2) +; RV64I-NEXT: lw s1, 68(a2) +; RV64I-NEXT: lw s2, 72(a2) +; RV64I-NEXT: lw s3, 76(a2) +; RV64I-NEXT: lw s4, 80(a2) +; RV64I-NEXT: lw s5, 84(a2) +; RV64I-NEXT: lw s6, 88(a2) +; RV64I-NEXT: lw s7, 92(a2) +; RV64I-NEXT: lw s8, 96(a2) +; RV64I-NEXT: lw s9, 100(a2) +; RV64I-NEXT: lw s10, 104(a2) +; RV64I-NEXT: lw s11, 108(a2) +; RV64I-NEXT: lw ra, 112(a2) +; RV64I-NEXT: lw a3, 116(a2) +; RV64I-NEXT: lw a1, 120(a2) +; RV64I-NEXT: lw a0, 124(a2) +; RV64I-NEXT: lw a7, %lo(var_test_irq+4)(a4) +; RV64I-NEXT: lw a6, %lo(var_test_irq+8)(a4) +; RV64I-NEXT: lw a5, %lo(var_test_irq+12)(a4) +; RV64I-NEXT: sw a0, 124(a2) +; RV64I-NEXT: sw a1, 120(a2) +; RV64I-NEXT: sw a3, 116(a2) +; RV64I-NEXT: sw ra, 112(a2) +; RV64I-NEXT: sw s11, 108(a2) +; RV64I-NEXT: sw s10, 104(a2) +; RV64I-NEXT: sw s9, 100(a2) +; RV64I-NEXT: sw s8, 96(a2) +; RV64I-NEXT: sw s7, 92(a2) +; RV64I-NEXT: sw s6, 88(a2) +; RV64I-NEXT: sw s5, 84(a2) +; RV64I-NEXT: sw s4, 80(a2) +; RV64I-NEXT: sw s3, 76(a2) +; RV64I-NEXT: sw s2, 72(a2) +; RV64I-NEXT: sw s1, 68(a2) +; RV64I-NEXT: sw s0, 64(a2) +; RV64I-NEXT: sw t6, 60(a2) +; RV64I-NEXT: sw t5, 56(a2) +; RV64I-NEXT: sw t4, 52(a2) +; RV64I-NEXT: sw t3, 48(a2) +; RV64I-NEXT: sw t2, 44(a2) +; RV64I-NEXT: sw t1, 40(a2) +; RV64I-NEXT: sw t0, 36(a2) ; RV64I-NEXT: ld a0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, 20(a5) +; RV64I-NEXT: sw a0, 32(a2) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, 16(a5) +; RV64I-NEXT: sw a0, 28(a2) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64I-NEXT: sw a0, 24(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64I-NEXT: sw a0, 20(a2) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64I-NEXT: sw a0, 16(a2) +; RV64I-NEXT: sw a5, %lo(var_test_irq+12)(a4) +; RV64I-NEXT: sw a6, %lo(var_test_irq+8)(a4) +; RV64I-NEXT: sw a7, %lo(var_test_irq+4)(a4) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV64I-NEXT: ld ra, 264(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t0, 256(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t1, 248(sp) # 8-byte Folded Reload @@ -2546,333 +2546,333 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-LABEL: callee_no_irq: ; RV32IZCMP: # %bb.0: ; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-NEXT: lui a6, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-NEXT: lui a5, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a5) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-NEXT: addi a2, a5, %lo(var_test_irq) +; RV32IZCMP-NEXT: lw a0, 16(a2) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-NEXT: lw a0, 20(a2) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-NEXT: lw a0, 24(a2) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte 
Folded Spill -; RV32IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) -; RV32IZCMP-NEXT: lw a0, 16(a5) +; RV32IZCMP-NEXT: lw a0, 28(a2) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, 20(a5) +; RV32IZCMP-NEXT: lw a0, 32(a2) ; RV32IZCMP-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw t4, 24(a5) -; RV32IZCMP-NEXT: lw t5, 28(a5) -; RV32IZCMP-NEXT: lw t6, 32(a5) -; RV32IZCMP-NEXT: lw s2, 36(a5) -; RV32IZCMP-NEXT: lw s3, 40(a5) -; RV32IZCMP-NEXT: lw s4, 44(a5) -; RV32IZCMP-NEXT: lw s5, 48(a5) -; RV32IZCMP-NEXT: lw s6, 52(a5) -; RV32IZCMP-NEXT: lw s7, 56(a5) -; RV32IZCMP-NEXT: lw s8, 60(a5) -; RV32IZCMP-NEXT: lw s9, 64(a5) -; RV32IZCMP-NEXT: lw s10, 68(a5) -; RV32IZCMP-NEXT: lw s11, 72(a5) -; RV32IZCMP-NEXT: lw ra, 76(a5) -; RV32IZCMP-NEXT: lw s1, 80(a5) -; RV32IZCMP-NEXT: lw t3, 84(a5) -; RV32IZCMP-NEXT: lw t2, 88(a5) -; RV32IZCMP-NEXT: lw t1, 92(a5) -; RV32IZCMP-NEXT: lw t0, 96(a5) -; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a7, 104(a5) -; RV32IZCMP-NEXT: lw a4, 108(a5) -; RV32IZCMP-NEXT: lw a0, 124(a5) -; RV32IZCMP-NEXT: lw a1, 120(a5) -; RV32IZCMP-NEXT: lw a2, 116(a5) -; RV32IZCMP-NEXT: lw a3, 112(a5) -; RV32IZCMP-NEXT: sw a0, 124(a5) -; RV32IZCMP-NEXT: sw a1, 120(a5) -; RV32IZCMP-NEXT: sw a2, 116(a5) -; RV32IZCMP-NEXT: sw a3, 112(a5) -; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a7, 104(a5) -; RV32IZCMP-NEXT: sw s0, 100(a5) -; RV32IZCMP-NEXT: sw t0, 96(a5) -; RV32IZCMP-NEXT: sw t1, 92(a5) -; RV32IZCMP-NEXT: sw t2, 88(a5) -; RV32IZCMP-NEXT: sw t3, 84(a5) -; RV32IZCMP-NEXT: sw s1, 80(a5) -; RV32IZCMP-NEXT: sw ra, 76(a5) -; RV32IZCMP-NEXT: sw s11, 72(a5) -; RV32IZCMP-NEXT: sw s10, 68(a5) -; RV32IZCMP-NEXT: sw s9, 64(a5) -; RV32IZCMP-NEXT: sw s8, 60(a5) -; RV32IZCMP-NEXT: sw s7, 56(a5) -; RV32IZCMP-NEXT: sw s6, 52(a5) -; RV32IZCMP-NEXT: sw s5, 48(a5) -; RV32IZCMP-NEXT: sw s4, 44(a5) -; RV32IZCMP-NEXT: sw s3, 40(a5) -; RV32IZCMP-NEXT: sw s2, 36(a5) -; RV32IZCMP-NEXT: sw t6, 32(a5) -; RV32IZCMP-NEXT: sw t5, 28(a5) -; RV32IZCMP-NEXT: sw t4, 24(a5) +; RV32IZCMP-NEXT: lw t4, 36(a2) +; RV32IZCMP-NEXT: lw t5, 40(a2) +; RV32IZCMP-NEXT: lw t6, 44(a2) +; RV32IZCMP-NEXT: lw s2, 48(a2) +; RV32IZCMP-NEXT: lw s3, 52(a2) +; RV32IZCMP-NEXT: lw s4, 56(a2) +; RV32IZCMP-NEXT: lw s5, 60(a2) +; RV32IZCMP-NEXT: lw s6, 64(a2) +; RV32IZCMP-NEXT: lw s7, 68(a2) +; RV32IZCMP-NEXT: lw s8, 72(a2) +; RV32IZCMP-NEXT: lw s9, 76(a2) +; RV32IZCMP-NEXT: lw s10, 80(a2) +; RV32IZCMP-NEXT: lw s11, 84(a2) +; RV32IZCMP-NEXT: lw ra, 88(a2) +; RV32IZCMP-NEXT: lw s1, 92(a2) +; RV32IZCMP-NEXT: lw t0, 96(a2) +; RV32IZCMP-NEXT: lw a7, 100(a2) +; RV32IZCMP-NEXT: lw a6, 104(a2) +; RV32IZCMP-NEXT: lw a4, 108(a2) +; RV32IZCMP-NEXT: lw s0, 112(a2) +; RV32IZCMP-NEXT: lw a3, 116(a2) +; RV32IZCMP-NEXT: lw a1, 120(a2) +; RV32IZCMP-NEXT: lw a0, 124(a2) +; RV32IZCMP-NEXT: lw t3, %lo(var_test_irq+4)(a5) +; RV32IZCMP-NEXT: lw t2, %lo(var_test_irq+8)(a5) +; RV32IZCMP-NEXT: lw t1, %lo(var_test_irq+12)(a5) +; RV32IZCMP-NEXT: sw a0, 124(a2) +; RV32IZCMP-NEXT: sw a1, 120(a2) +; RV32IZCMP-NEXT: sw a3, 116(a2) +; RV32IZCMP-NEXT: sw s0, 112(a2) +; RV32IZCMP-NEXT: sw a4, 108(a2) +; RV32IZCMP-NEXT: sw a6, 104(a2) +; RV32IZCMP-NEXT: sw a7, 100(a2) +; RV32IZCMP-NEXT: sw t0, 96(a2) +; RV32IZCMP-NEXT: sw s1, 92(a2) +; RV32IZCMP-NEXT: sw ra, 88(a2) +; RV32IZCMP-NEXT: sw s11, 84(a2) +; RV32IZCMP-NEXT: sw s10, 80(a2) +; RV32IZCMP-NEXT: sw s9, 76(a2) +; RV32IZCMP-NEXT: sw s8, 72(a2) +; RV32IZCMP-NEXT: sw s7, 68(a2) +; RV32IZCMP-NEXT: sw s6, 64(a2) +; RV32IZCMP-NEXT: sw s5, 60(a2) +; RV32IZCMP-NEXT: sw s4, 
56(a2) +; RV32IZCMP-NEXT: sw s3, 52(a2) +; RV32IZCMP-NEXT: sw s2, 48(a2) +; RV32IZCMP-NEXT: sw t6, 44(a2) +; RV32IZCMP-NEXT: sw t5, 40(a2) +; RV32IZCMP-NEXT: sw t4, 36(a2) ; RV32IZCMP-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, 20(a5) +; RV32IZCMP-NEXT: sw a0, 32(a2) ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, 16(a5) +; RV32IZCMP-NEXT: sw a0, 28(a2) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-NEXT: sw a0, 24(a2) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-NEXT: sw a0, 20(a2) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-NEXT: sw a0, 16(a2) +; RV32IZCMP-NEXT: sw t1, %lo(var_test_irq+12)(a5) +; RV32IZCMP-NEXT: sw t2, %lo(var_test_irq+8)(a5) +; RV32IZCMP-NEXT: sw t3, %lo(var_test_irq+4)(a5) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a5) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV64IZCMP-LABEL: callee_no_irq: ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-NEXT: lui a6, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-NEXT: lui a5, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a5) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-NEXT: addi a2, a5, %lo(var_test_irq) +; RV64IZCMP-NEXT: lw a0, 16(a2) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-NEXT: lw a0, 20(a2) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-NEXT: lw a0, 24(a2) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) -; RV64IZCMP-NEXT: lw a0, 16(a5) +; RV64IZCMP-NEXT: lw a0, 28(a2) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, 20(a5) +; RV64IZCMP-NEXT: lw a0, 32(a2) ; RV64IZCMP-NEXT: sd a0, 0(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw t4, 24(a5) -; RV64IZCMP-NEXT: lw t5, 28(a5) -; RV64IZCMP-NEXT: lw t6, 32(a5) -; RV64IZCMP-NEXT: lw s2, 36(a5) -; RV64IZCMP-NEXT: lw s3, 40(a5) -; RV64IZCMP-NEXT: lw s4, 44(a5) -; RV64IZCMP-NEXT: lw s5, 48(a5) -; RV64IZCMP-NEXT: lw s6, 52(a5) -; RV64IZCMP-NEXT: lw s7, 56(a5) -; RV64IZCMP-NEXT: lw s8, 60(a5) -; RV64IZCMP-NEXT: lw s9, 64(a5) -; RV64IZCMP-NEXT: lw s10, 68(a5) -; RV64IZCMP-NEXT: lw s11, 72(a5) -; RV64IZCMP-NEXT: lw ra, 76(a5) -; RV64IZCMP-NEXT: lw s1, 80(a5) -; RV64IZCMP-NEXT: lw t3, 84(a5) -; RV64IZCMP-NEXT: lw t2, 88(a5) -; RV64IZCMP-NEXT: lw t1, 92(a5) -; RV64IZCMP-NEXT: lw t0, 96(a5) -; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a7, 104(a5) -; RV64IZCMP-NEXT: lw a4, 108(a5) -; RV64IZCMP-NEXT: lw a0, 124(a5) -; RV64IZCMP-NEXT: lw a1, 120(a5) -; RV64IZCMP-NEXT: lw a2, 116(a5) -; RV64IZCMP-NEXT: lw a3, 112(a5) -; RV64IZCMP-NEXT: sw a0, 124(a5) -; RV64IZCMP-NEXT: sw a1, 120(a5) -; RV64IZCMP-NEXT: sw a2, 116(a5) -; RV64IZCMP-NEXT: sw a3, 112(a5) -; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a7, 104(a5) -; RV64IZCMP-NEXT: sw s0, 100(a5) -; RV64IZCMP-NEXT: sw t0, 96(a5) -; RV64IZCMP-NEXT: sw t1, 92(a5) -; RV64IZCMP-NEXT: sw t2, 88(a5) -; RV64IZCMP-NEXT: sw t3, 84(a5) -; RV64IZCMP-NEXT: sw s1, 80(a5) -; 
RV64IZCMP-NEXT: sw ra, 76(a5) -; RV64IZCMP-NEXT: sw s11, 72(a5) -; RV64IZCMP-NEXT: sw s10, 68(a5) -; RV64IZCMP-NEXT: sw s9, 64(a5) -; RV64IZCMP-NEXT: sw s8, 60(a5) -; RV64IZCMP-NEXT: sw s7, 56(a5) -; RV64IZCMP-NEXT: sw s6, 52(a5) -; RV64IZCMP-NEXT: sw s5, 48(a5) -; RV64IZCMP-NEXT: sw s4, 44(a5) -; RV64IZCMP-NEXT: sw s3, 40(a5) -; RV64IZCMP-NEXT: sw s2, 36(a5) -; RV64IZCMP-NEXT: sw t6, 32(a5) -; RV64IZCMP-NEXT: sw t5, 28(a5) -; RV64IZCMP-NEXT: sw t4, 24(a5) +; RV64IZCMP-NEXT: lw t4, 36(a2) +; RV64IZCMP-NEXT: lw t5, 40(a2) +; RV64IZCMP-NEXT: lw t6, 44(a2) +; RV64IZCMP-NEXT: lw s2, 48(a2) +; RV64IZCMP-NEXT: lw s3, 52(a2) +; RV64IZCMP-NEXT: lw s4, 56(a2) +; RV64IZCMP-NEXT: lw s5, 60(a2) +; RV64IZCMP-NEXT: lw s6, 64(a2) +; RV64IZCMP-NEXT: lw s7, 68(a2) +; RV64IZCMP-NEXT: lw s8, 72(a2) +; RV64IZCMP-NEXT: lw s9, 76(a2) +; RV64IZCMP-NEXT: lw s10, 80(a2) +; RV64IZCMP-NEXT: lw s11, 84(a2) +; RV64IZCMP-NEXT: lw ra, 88(a2) +; RV64IZCMP-NEXT: lw s1, 92(a2) +; RV64IZCMP-NEXT: lw t0, 96(a2) +; RV64IZCMP-NEXT: lw a7, 100(a2) +; RV64IZCMP-NEXT: lw a6, 104(a2) +; RV64IZCMP-NEXT: lw a4, 108(a2) +; RV64IZCMP-NEXT: lw s0, 112(a2) +; RV64IZCMP-NEXT: lw a3, 116(a2) +; RV64IZCMP-NEXT: lw a1, 120(a2) +; RV64IZCMP-NEXT: lw a0, 124(a2) +; RV64IZCMP-NEXT: lw t3, %lo(var_test_irq+4)(a5) +; RV64IZCMP-NEXT: lw t2, %lo(var_test_irq+8)(a5) +; RV64IZCMP-NEXT: lw t1, %lo(var_test_irq+12)(a5) +; RV64IZCMP-NEXT: sw a0, 124(a2) +; RV64IZCMP-NEXT: sw a1, 120(a2) +; RV64IZCMP-NEXT: sw a3, 116(a2) +; RV64IZCMP-NEXT: sw s0, 112(a2) +; RV64IZCMP-NEXT: sw a4, 108(a2) +; RV64IZCMP-NEXT: sw a6, 104(a2) +; RV64IZCMP-NEXT: sw a7, 100(a2) +; RV64IZCMP-NEXT: sw t0, 96(a2) +; RV64IZCMP-NEXT: sw s1, 92(a2) +; RV64IZCMP-NEXT: sw ra, 88(a2) +; RV64IZCMP-NEXT: sw s11, 84(a2) +; RV64IZCMP-NEXT: sw s10, 80(a2) +; RV64IZCMP-NEXT: sw s9, 76(a2) +; RV64IZCMP-NEXT: sw s8, 72(a2) +; RV64IZCMP-NEXT: sw s7, 68(a2) +; RV64IZCMP-NEXT: sw s6, 64(a2) +; RV64IZCMP-NEXT: sw s5, 60(a2) +; RV64IZCMP-NEXT: sw s4, 56(a2) +; RV64IZCMP-NEXT: sw s3, 52(a2) +; RV64IZCMP-NEXT: sw s2, 48(a2) +; RV64IZCMP-NEXT: sw t6, 44(a2) +; RV64IZCMP-NEXT: sw t5, 40(a2) +; RV64IZCMP-NEXT: sw t4, 36(a2) ; RV64IZCMP-NEXT: ld a0, 0(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, 20(a5) +; RV64IZCMP-NEXT: sw a0, 32(a2) ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, 16(a5) +; RV64IZCMP-NEXT: sw a0, 28(a2) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-NEXT: sw a0, 24(a2) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-NEXT: sw a0, 20(a2) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-NEXT: sw a0, 16(a2) +; RV64IZCMP-NEXT: sw t1, %lo(var_test_irq+12)(a5) +; RV64IZCMP-NEXT: sw t2, %lo(var_test_irq+8)(a5) +; RV64IZCMP-NEXT: sw t3, %lo(var_test_irq+4)(a5) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a5) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32IZCMP-SR-LABEL: callee_no_irq: ; RV32IZCMP-SR: # %bb.0: ; RV32IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-SR-NEXT: lui a5, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a5) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, 
%lo(var_test_irq+4)(a6) +; RV32IZCMP-SR-NEXT: addi a2, a5, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, 16(a2) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-SR-NEXT: lw a0, 20(a2) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-SR-NEXT: lw a0, 24(a2) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, 16(a5) +; RV32IZCMP-SR-NEXT: lw a0, 28(a2) ; RV32IZCMP-SR-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, 20(a5) +; RV32IZCMP-SR-NEXT: lw a0, 32(a2) ; RV32IZCMP-SR-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw t4, 24(a5) -; RV32IZCMP-SR-NEXT: lw t5, 28(a5) -; RV32IZCMP-SR-NEXT: lw t6, 32(a5) -; RV32IZCMP-SR-NEXT: lw s2, 36(a5) -; RV32IZCMP-SR-NEXT: lw s3, 40(a5) -; RV32IZCMP-SR-NEXT: lw s4, 44(a5) -; RV32IZCMP-SR-NEXT: lw s5, 48(a5) -; RV32IZCMP-SR-NEXT: lw s6, 52(a5) -; RV32IZCMP-SR-NEXT: lw s7, 56(a5) -; RV32IZCMP-SR-NEXT: lw s8, 60(a5) -; RV32IZCMP-SR-NEXT: lw s9, 64(a5) -; RV32IZCMP-SR-NEXT: lw s10, 68(a5) -; RV32IZCMP-SR-NEXT: lw s11, 72(a5) -; RV32IZCMP-SR-NEXT: lw ra, 76(a5) -; RV32IZCMP-SR-NEXT: lw s1, 80(a5) -; RV32IZCMP-SR-NEXT: lw t3, 84(a5) -; RV32IZCMP-SR-NEXT: lw t2, 88(a5) -; RV32IZCMP-SR-NEXT: lw t1, 92(a5) -; RV32IZCMP-SR-NEXT: lw t0, 96(a5) -; RV32IZCMP-SR-NEXT: lw s0, 100(a5) -; RV32IZCMP-SR-NEXT: lw a7, 104(a5) -; RV32IZCMP-SR-NEXT: lw a4, 108(a5) -; RV32IZCMP-SR-NEXT: lw a0, 124(a5) -; RV32IZCMP-SR-NEXT: lw a1, 120(a5) -; RV32IZCMP-SR-NEXT: lw a2, 116(a5) -; RV32IZCMP-SR-NEXT: lw a3, 112(a5) -; RV32IZCMP-SR-NEXT: sw a0, 124(a5) -; RV32IZCMP-SR-NEXT: sw a1, 120(a5) -; RV32IZCMP-SR-NEXT: sw a2, 116(a5) -; RV32IZCMP-SR-NEXT: sw a3, 112(a5) -; RV32IZCMP-SR-NEXT: sw a4, 108(a5) -; RV32IZCMP-SR-NEXT: sw a7, 104(a5) -; RV32IZCMP-SR-NEXT: sw s0, 100(a5) -; RV32IZCMP-SR-NEXT: sw t0, 96(a5) -; RV32IZCMP-SR-NEXT: sw t1, 92(a5) -; RV32IZCMP-SR-NEXT: sw t2, 88(a5) -; RV32IZCMP-SR-NEXT: sw t3, 84(a5) -; RV32IZCMP-SR-NEXT: sw s1, 80(a5) -; RV32IZCMP-SR-NEXT: sw ra, 76(a5) -; RV32IZCMP-SR-NEXT: sw s11, 72(a5) -; RV32IZCMP-SR-NEXT: sw s10, 68(a5) -; RV32IZCMP-SR-NEXT: sw s9, 64(a5) -; RV32IZCMP-SR-NEXT: sw s8, 60(a5) -; RV32IZCMP-SR-NEXT: sw s7, 56(a5) -; RV32IZCMP-SR-NEXT: sw s6, 52(a5) -; RV32IZCMP-SR-NEXT: sw s5, 48(a5) -; RV32IZCMP-SR-NEXT: sw s4, 44(a5) -; RV32IZCMP-SR-NEXT: sw s3, 40(a5) -; RV32IZCMP-SR-NEXT: sw s2, 36(a5) -; RV32IZCMP-SR-NEXT: sw t6, 32(a5) -; RV32IZCMP-SR-NEXT: sw t5, 28(a5) -; RV32IZCMP-SR-NEXT: sw t4, 24(a5) +; RV32IZCMP-SR-NEXT: lw t4, 36(a2) +; RV32IZCMP-SR-NEXT: lw t5, 40(a2) +; RV32IZCMP-SR-NEXT: lw t6, 44(a2) +; RV32IZCMP-SR-NEXT: lw s2, 48(a2) +; RV32IZCMP-SR-NEXT: lw s3, 52(a2) +; RV32IZCMP-SR-NEXT: lw s4, 56(a2) +; RV32IZCMP-SR-NEXT: lw s5, 60(a2) +; RV32IZCMP-SR-NEXT: lw s6, 64(a2) +; RV32IZCMP-SR-NEXT: lw s7, 68(a2) +; RV32IZCMP-SR-NEXT: lw s8, 72(a2) +; RV32IZCMP-SR-NEXT: lw s9, 76(a2) +; RV32IZCMP-SR-NEXT: lw s10, 80(a2) +; RV32IZCMP-SR-NEXT: lw s11, 84(a2) +; RV32IZCMP-SR-NEXT: lw ra, 88(a2) +; RV32IZCMP-SR-NEXT: lw s1, 92(a2) +; RV32IZCMP-SR-NEXT: lw t0, 96(a2) +; RV32IZCMP-SR-NEXT: lw a7, 100(a2) +; RV32IZCMP-SR-NEXT: lw a6, 104(a2) +; RV32IZCMP-SR-NEXT: lw a4, 108(a2) +; RV32IZCMP-SR-NEXT: lw s0, 112(a2) +; RV32IZCMP-SR-NEXT: lw a3, 116(a2) +; RV32IZCMP-SR-NEXT: lw a1, 120(a2) +; RV32IZCMP-SR-NEXT: lw a0, 124(a2) +; RV32IZCMP-SR-NEXT: lw t3, %lo(var_test_irq+4)(a5) 
+; RV32IZCMP-SR-NEXT: lw t2, %lo(var_test_irq+8)(a5) +; RV32IZCMP-SR-NEXT: lw t1, %lo(var_test_irq+12)(a5) +; RV32IZCMP-SR-NEXT: sw a0, 124(a2) +; RV32IZCMP-SR-NEXT: sw a1, 120(a2) +; RV32IZCMP-SR-NEXT: sw a3, 116(a2) +; RV32IZCMP-SR-NEXT: sw s0, 112(a2) +; RV32IZCMP-SR-NEXT: sw a4, 108(a2) +; RV32IZCMP-SR-NEXT: sw a6, 104(a2) +; RV32IZCMP-SR-NEXT: sw a7, 100(a2) +; RV32IZCMP-SR-NEXT: sw t0, 96(a2) +; RV32IZCMP-SR-NEXT: sw s1, 92(a2) +; RV32IZCMP-SR-NEXT: sw ra, 88(a2) +; RV32IZCMP-SR-NEXT: sw s11, 84(a2) +; RV32IZCMP-SR-NEXT: sw s10, 80(a2) +; RV32IZCMP-SR-NEXT: sw s9, 76(a2) +; RV32IZCMP-SR-NEXT: sw s8, 72(a2) +; RV32IZCMP-SR-NEXT: sw s7, 68(a2) +; RV32IZCMP-SR-NEXT: sw s6, 64(a2) +; RV32IZCMP-SR-NEXT: sw s5, 60(a2) +; RV32IZCMP-SR-NEXT: sw s4, 56(a2) +; RV32IZCMP-SR-NEXT: sw s3, 52(a2) +; RV32IZCMP-SR-NEXT: sw s2, 48(a2) +; RV32IZCMP-SR-NEXT: sw t6, 44(a2) +; RV32IZCMP-SR-NEXT: sw t5, 40(a2) +; RV32IZCMP-SR-NEXT: sw t4, 36(a2) ; RV32IZCMP-SR-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, 20(a5) +; RV32IZCMP-SR-NEXT: sw a0, 32(a2) ; RV32IZCMP-SR-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, 16(a5) +; RV32IZCMP-SR-NEXT: sw a0, 28(a2) ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32IZCMP-SR-NEXT: sw a0, 24(a2) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32IZCMP-SR-NEXT: sw a0, 20(a2) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32IZCMP-SR-NEXT: sw a0, 16(a2) +; RV32IZCMP-SR-NEXT: sw t1, %lo(var_test_irq+12)(a5) +; RV32IZCMP-SR-NEXT: sw t2, %lo(var_test_irq+8)(a5) +; RV32IZCMP-SR-NEXT: sw t3, %lo(var_test_irq+4)(a5) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a5) ; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV64IZCMP-SR-LABEL: callee_no_irq: ; RV64IZCMP-SR: # %bb.0: ; RV64IZCMP-SR-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-SR-NEXT: lui a5, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a5) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-SR-NEXT: addi a2, a5, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, 16(a2) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-SR-NEXT: lw a0, 20(a2) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-SR-NEXT: lw a0, 24(a2) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, 16(a5) +; RV64IZCMP-SR-NEXT: lw a0, 28(a2) ; RV64IZCMP-SR-NEXT: sd a0, 8(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, 20(a5) +; RV64IZCMP-SR-NEXT: lw a0, 32(a2) ; RV64IZCMP-SR-NEXT: sd a0, 0(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw t4, 24(a5) -; RV64IZCMP-SR-NEXT: lw t5, 28(a5) -; RV64IZCMP-SR-NEXT: lw t6, 32(a5) -; RV64IZCMP-SR-NEXT: lw s2, 36(a5) -; RV64IZCMP-SR-NEXT: lw s3, 40(a5) -; RV64IZCMP-SR-NEXT: lw s4, 44(a5) -; RV64IZCMP-SR-NEXT: lw s5, 48(a5) -; RV64IZCMP-SR-NEXT: lw s6, 52(a5) -; RV64IZCMP-SR-NEXT: lw s7, 56(a5) -; RV64IZCMP-SR-NEXT: lw s8, 60(a5) -; 
RV64IZCMP-SR-NEXT: lw s9, 64(a5) -; RV64IZCMP-SR-NEXT: lw s10, 68(a5) -; RV64IZCMP-SR-NEXT: lw s11, 72(a5) -; RV64IZCMP-SR-NEXT: lw ra, 76(a5) -; RV64IZCMP-SR-NEXT: lw s1, 80(a5) -; RV64IZCMP-SR-NEXT: lw t3, 84(a5) -; RV64IZCMP-SR-NEXT: lw t2, 88(a5) -; RV64IZCMP-SR-NEXT: lw t1, 92(a5) -; RV64IZCMP-SR-NEXT: lw t0, 96(a5) -; RV64IZCMP-SR-NEXT: lw s0, 100(a5) -; RV64IZCMP-SR-NEXT: lw a7, 104(a5) -; RV64IZCMP-SR-NEXT: lw a4, 108(a5) -; RV64IZCMP-SR-NEXT: lw a0, 124(a5) -; RV64IZCMP-SR-NEXT: lw a1, 120(a5) -; RV64IZCMP-SR-NEXT: lw a2, 116(a5) -; RV64IZCMP-SR-NEXT: lw a3, 112(a5) -; RV64IZCMP-SR-NEXT: sw a0, 124(a5) -; RV64IZCMP-SR-NEXT: sw a1, 120(a5) -; RV64IZCMP-SR-NEXT: sw a2, 116(a5) -; RV64IZCMP-SR-NEXT: sw a3, 112(a5) -; RV64IZCMP-SR-NEXT: sw a4, 108(a5) -; RV64IZCMP-SR-NEXT: sw a7, 104(a5) -; RV64IZCMP-SR-NEXT: sw s0, 100(a5) -; RV64IZCMP-SR-NEXT: sw t0, 96(a5) -; RV64IZCMP-SR-NEXT: sw t1, 92(a5) -; RV64IZCMP-SR-NEXT: sw t2, 88(a5) -; RV64IZCMP-SR-NEXT: sw t3, 84(a5) -; RV64IZCMP-SR-NEXT: sw s1, 80(a5) -; RV64IZCMP-SR-NEXT: sw ra, 76(a5) -; RV64IZCMP-SR-NEXT: sw s11, 72(a5) -; RV64IZCMP-SR-NEXT: sw s10, 68(a5) -; RV64IZCMP-SR-NEXT: sw s9, 64(a5) -; RV64IZCMP-SR-NEXT: sw s8, 60(a5) -; RV64IZCMP-SR-NEXT: sw s7, 56(a5) -; RV64IZCMP-SR-NEXT: sw s6, 52(a5) -; RV64IZCMP-SR-NEXT: sw s5, 48(a5) -; RV64IZCMP-SR-NEXT: sw s4, 44(a5) -; RV64IZCMP-SR-NEXT: sw s3, 40(a5) -; RV64IZCMP-SR-NEXT: sw s2, 36(a5) -; RV64IZCMP-SR-NEXT: sw t6, 32(a5) -; RV64IZCMP-SR-NEXT: sw t5, 28(a5) -; RV64IZCMP-SR-NEXT: sw t4, 24(a5) +; RV64IZCMP-SR-NEXT: lw t4, 36(a2) +; RV64IZCMP-SR-NEXT: lw t5, 40(a2) +; RV64IZCMP-SR-NEXT: lw t6, 44(a2) +; RV64IZCMP-SR-NEXT: lw s2, 48(a2) +; RV64IZCMP-SR-NEXT: lw s3, 52(a2) +; RV64IZCMP-SR-NEXT: lw s4, 56(a2) +; RV64IZCMP-SR-NEXT: lw s5, 60(a2) +; RV64IZCMP-SR-NEXT: lw s6, 64(a2) +; RV64IZCMP-SR-NEXT: lw s7, 68(a2) +; RV64IZCMP-SR-NEXT: lw s8, 72(a2) +; RV64IZCMP-SR-NEXT: lw s9, 76(a2) +; RV64IZCMP-SR-NEXT: lw s10, 80(a2) +; RV64IZCMP-SR-NEXT: lw s11, 84(a2) +; RV64IZCMP-SR-NEXT: lw ra, 88(a2) +; RV64IZCMP-SR-NEXT: lw s1, 92(a2) +; RV64IZCMP-SR-NEXT: lw t0, 96(a2) +; RV64IZCMP-SR-NEXT: lw a7, 100(a2) +; RV64IZCMP-SR-NEXT: lw a6, 104(a2) +; RV64IZCMP-SR-NEXT: lw a4, 108(a2) +; RV64IZCMP-SR-NEXT: lw s0, 112(a2) +; RV64IZCMP-SR-NEXT: lw a3, 116(a2) +; RV64IZCMP-SR-NEXT: lw a1, 120(a2) +; RV64IZCMP-SR-NEXT: lw a0, 124(a2) +; RV64IZCMP-SR-NEXT: lw t3, %lo(var_test_irq+4)(a5) +; RV64IZCMP-SR-NEXT: lw t2, %lo(var_test_irq+8)(a5) +; RV64IZCMP-SR-NEXT: lw t1, %lo(var_test_irq+12)(a5) +; RV64IZCMP-SR-NEXT: sw a0, 124(a2) +; RV64IZCMP-SR-NEXT: sw a1, 120(a2) +; RV64IZCMP-SR-NEXT: sw a3, 116(a2) +; RV64IZCMP-SR-NEXT: sw s0, 112(a2) +; RV64IZCMP-SR-NEXT: sw a4, 108(a2) +; RV64IZCMP-SR-NEXT: sw a6, 104(a2) +; RV64IZCMP-SR-NEXT: sw a7, 100(a2) +; RV64IZCMP-SR-NEXT: sw t0, 96(a2) +; RV64IZCMP-SR-NEXT: sw s1, 92(a2) +; RV64IZCMP-SR-NEXT: sw ra, 88(a2) +; RV64IZCMP-SR-NEXT: sw s11, 84(a2) +; RV64IZCMP-SR-NEXT: sw s10, 80(a2) +; RV64IZCMP-SR-NEXT: sw s9, 76(a2) +; RV64IZCMP-SR-NEXT: sw s8, 72(a2) +; RV64IZCMP-SR-NEXT: sw s7, 68(a2) +; RV64IZCMP-SR-NEXT: sw s6, 64(a2) +; RV64IZCMP-SR-NEXT: sw s5, 60(a2) +; RV64IZCMP-SR-NEXT: sw s4, 56(a2) +; RV64IZCMP-SR-NEXT: sw s3, 52(a2) +; RV64IZCMP-SR-NEXT: sw s2, 48(a2) +; RV64IZCMP-SR-NEXT: sw t6, 44(a2) +; RV64IZCMP-SR-NEXT: sw t5, 40(a2) +; RV64IZCMP-SR-NEXT: sw t4, 36(a2) ; RV64IZCMP-SR-NEXT: ld a0, 0(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, 20(a5) +; RV64IZCMP-SR-NEXT: sw a0, 32(a2) ; RV64IZCMP-SR-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; 
RV64IZCMP-SR-NEXT: sw a0, 16(a5) +; RV64IZCMP-SR-NEXT: sw a0, 28(a2) ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64IZCMP-SR-NEXT: sw a0, 24(a2) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV64IZCMP-SR-NEXT: sw a0, 20(a2) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64IZCMP-SR-NEXT: sw a0, 16(a2) +; RV64IZCMP-SR-NEXT: sw t1, %lo(var_test_irq+12)(a5) +; RV64IZCMP-SR-NEXT: sw t2, %lo(var_test_irq+8)(a5) +; RV64IZCMP-SR-NEXT: sw t3, %lo(var_test_irq+4)(a5) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a5) ; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32I-LABEL: callee_no_irq: @@ -2891,84 +2891,84 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a6, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV32I-NEXT: lui a4, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV32I-NEXT: addi a2, a4, %lo(var_test_irq) +; RV32I-NEXT: lw a0, 16(a2) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV32I-NEXT: lw a0, 20(a2) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV32I-NEXT: lw a0, 24(a2) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a6, %lo(var_test_irq) -; RV32I-NEXT: lw a0, 16(a5) +; RV32I-NEXT: lw a0, 28(a2) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, 20(a5) +; RV32I-NEXT: lw a0, 32(a2) ; RV32I-NEXT: sw a0, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw t0, 24(a5) -; RV32I-NEXT: lw t1, 28(a5) -; RV32I-NEXT: lw t2, 32(a5) -; RV32I-NEXT: lw t3, 36(a5) -; RV32I-NEXT: lw t4, 40(a5) -; RV32I-NEXT: lw t5, 44(a5) -; RV32I-NEXT: lw t6, 48(a5) -; RV32I-NEXT: lw s0, 52(a5) -; RV32I-NEXT: lw s1, 56(a5) -; RV32I-NEXT: lw s2, 60(a5) -; RV32I-NEXT: lw s3, 64(a5) -; RV32I-NEXT: lw s4, 68(a5) -; RV32I-NEXT: lw s5, 72(a5) -; RV32I-NEXT: lw s6, 76(a5) -; RV32I-NEXT: lw s7, 80(a5) -; RV32I-NEXT: lw s8, 84(a5) -; RV32I-NEXT: lw s9, 88(a5) -; RV32I-NEXT: lw s10, 92(a5) -; RV32I-NEXT: lw s11, 96(a5) -; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a7, 104(a5) -; RV32I-NEXT: lw a4, 108(a5) -; RV32I-NEXT: lw a0, 124(a5) -; RV32I-NEXT: lw a1, 120(a5) -; RV32I-NEXT: lw a2, 116(a5) -; RV32I-NEXT: lw a3, 112(a5) -; RV32I-NEXT: sw a0, 124(a5) -; RV32I-NEXT: sw a1, 120(a5) -; RV32I-NEXT: sw a2, 116(a5) -; RV32I-NEXT: sw a3, 112(a5) -; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a7, 104(a5) -; RV32I-NEXT: sw ra, 100(a5) -; RV32I-NEXT: sw s11, 96(a5) -; RV32I-NEXT: sw s10, 92(a5) -; RV32I-NEXT: sw s9, 88(a5) -; RV32I-NEXT: sw s8, 84(a5) -; RV32I-NEXT: sw s7, 80(a5) -; RV32I-NEXT: sw s6, 76(a5) -; RV32I-NEXT: sw s5, 72(a5) -; RV32I-NEXT: sw s4, 68(a5) -; RV32I-NEXT: sw s3, 64(a5) -; RV32I-NEXT: sw s2, 60(a5) -; RV32I-NEXT: sw s1, 56(a5) -; RV32I-NEXT: sw s0, 52(a5) -; RV32I-NEXT: sw t6, 48(a5) -; RV32I-NEXT: sw t5, 44(a5) -; RV32I-NEXT: sw t4, 40(a5) -; RV32I-NEXT: sw t3, 36(a5) -; RV32I-NEXT: sw t2, 32(a5) -; RV32I-NEXT: sw t1, 28(a5) -; RV32I-NEXT: sw t0, 24(a5) 
+; RV32I-NEXT: lw t0, 36(a2) +; RV32I-NEXT: lw t1, 40(a2) +; RV32I-NEXT: lw t2, 44(a2) +; RV32I-NEXT: lw t3, 48(a2) +; RV32I-NEXT: lw t4, 52(a2) +; RV32I-NEXT: lw t5, 56(a2) +; RV32I-NEXT: lw t6, 60(a2) +; RV32I-NEXT: lw s0, 64(a2) +; RV32I-NEXT: lw s1, 68(a2) +; RV32I-NEXT: lw s2, 72(a2) +; RV32I-NEXT: lw s3, 76(a2) +; RV32I-NEXT: lw s4, 80(a2) +; RV32I-NEXT: lw s5, 84(a2) +; RV32I-NEXT: lw s6, 88(a2) +; RV32I-NEXT: lw s7, 92(a2) +; RV32I-NEXT: lw s8, 96(a2) +; RV32I-NEXT: lw s9, 100(a2) +; RV32I-NEXT: lw s10, 104(a2) +; RV32I-NEXT: lw s11, 108(a2) +; RV32I-NEXT: lw ra, 112(a2) +; RV32I-NEXT: lw a3, 116(a2) +; RV32I-NEXT: lw a1, 120(a2) +; RV32I-NEXT: lw a0, 124(a2) +; RV32I-NEXT: lw a7, %lo(var_test_irq+4)(a4) +; RV32I-NEXT: lw a6, %lo(var_test_irq+8)(a4) +; RV32I-NEXT: lw a5, %lo(var_test_irq+12)(a4) +; RV32I-NEXT: sw a0, 124(a2) +; RV32I-NEXT: sw a1, 120(a2) +; RV32I-NEXT: sw a3, 116(a2) +; RV32I-NEXT: sw ra, 112(a2) +; RV32I-NEXT: sw s11, 108(a2) +; RV32I-NEXT: sw s10, 104(a2) +; RV32I-NEXT: sw s9, 100(a2) +; RV32I-NEXT: sw s8, 96(a2) +; RV32I-NEXT: sw s7, 92(a2) +; RV32I-NEXT: sw s6, 88(a2) +; RV32I-NEXT: sw s5, 84(a2) +; RV32I-NEXT: sw s4, 80(a2) +; RV32I-NEXT: sw s3, 76(a2) +; RV32I-NEXT: sw s2, 72(a2) +; RV32I-NEXT: sw s1, 68(a2) +; RV32I-NEXT: sw s0, 64(a2) +; RV32I-NEXT: sw t6, 60(a2) +; RV32I-NEXT: sw t5, 56(a2) +; RV32I-NEXT: sw t4, 52(a2) +; RV32I-NEXT: sw t3, 48(a2) +; RV32I-NEXT: sw t2, 44(a2) +; RV32I-NEXT: sw t1, 40(a2) +; RV32I-NEXT: sw t0, 36(a2) ; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, 20(a5) +; RV32I-NEXT: sw a0, 32(a2) ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, 16(a5) +; RV32I-NEXT: sw a0, 28(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV32I-NEXT: sw a0, 24(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; RV32I-NEXT: sw a0, 20(a2) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV32I-NEXT: sw a0, 16(a2) +; RV32I-NEXT: sw a5, %lo(var_test_irq+12)(a4) +; RV32I-NEXT: sw a6, %lo(var_test_irq+8)(a4) +; RV32I-NEXT: sw a7, %lo(var_test_irq+4)(a4) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -3001,84 +3001,84 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a6, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a6) +; RV64I-NEXT: lui a4, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a6) +; RV64I-NEXT: addi a2, a4, %lo(var_test_irq) +; RV64I-NEXT: lw a0, 16(a2) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a6) +; RV64I-NEXT: lw a0, 20(a2) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a6) +; RV64I-NEXT: lw a0, 24(a2) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a6, %lo(var_test_irq) -; RV64I-NEXT: lw a0, 16(a5) +; RV64I-NEXT: lw a0, 28(a2) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, 
20(a5) +; RV64I-NEXT: lw a0, 32(a2) ; RV64I-NEXT: sd a0, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw t0, 24(a5) -; RV64I-NEXT: lw t1, 28(a5) -; RV64I-NEXT: lw t2, 32(a5) -; RV64I-NEXT: lw t3, 36(a5) -; RV64I-NEXT: lw t4, 40(a5) -; RV64I-NEXT: lw t5, 44(a5) -; RV64I-NEXT: lw t6, 48(a5) -; RV64I-NEXT: lw s0, 52(a5) -; RV64I-NEXT: lw s1, 56(a5) -; RV64I-NEXT: lw s2, 60(a5) -; RV64I-NEXT: lw s3, 64(a5) -; RV64I-NEXT: lw s4, 68(a5) -; RV64I-NEXT: lw s5, 72(a5) -; RV64I-NEXT: lw s6, 76(a5) -; RV64I-NEXT: lw s7, 80(a5) -; RV64I-NEXT: lw s8, 84(a5) -; RV64I-NEXT: lw s9, 88(a5) -; RV64I-NEXT: lw s10, 92(a5) -; RV64I-NEXT: lw s11, 96(a5) -; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a7, 104(a5) -; RV64I-NEXT: lw a4, 108(a5) -; RV64I-NEXT: lw a0, 124(a5) -; RV64I-NEXT: lw a1, 120(a5) -; RV64I-NEXT: lw a2, 116(a5) -; RV64I-NEXT: lw a3, 112(a5) -; RV64I-NEXT: sw a0, 124(a5) -; RV64I-NEXT: sw a1, 120(a5) -; RV64I-NEXT: sw a2, 116(a5) -; RV64I-NEXT: sw a3, 112(a5) -; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a7, 104(a5) -; RV64I-NEXT: sw ra, 100(a5) -; RV64I-NEXT: sw s11, 96(a5) -; RV64I-NEXT: sw s10, 92(a5) -; RV64I-NEXT: sw s9, 88(a5) -; RV64I-NEXT: sw s8, 84(a5) -; RV64I-NEXT: sw s7, 80(a5) -; RV64I-NEXT: sw s6, 76(a5) -; RV64I-NEXT: sw s5, 72(a5) -; RV64I-NEXT: sw s4, 68(a5) -; RV64I-NEXT: sw s3, 64(a5) -; RV64I-NEXT: sw s2, 60(a5) -; RV64I-NEXT: sw s1, 56(a5) -; RV64I-NEXT: sw s0, 52(a5) -; RV64I-NEXT: sw t6, 48(a5) -; RV64I-NEXT: sw t5, 44(a5) -; RV64I-NEXT: sw t4, 40(a5) -; RV64I-NEXT: sw t3, 36(a5) -; RV64I-NEXT: sw t2, 32(a5) -; RV64I-NEXT: sw t1, 28(a5) -; RV64I-NEXT: sw t0, 24(a5) +; RV64I-NEXT: lw t0, 36(a2) +; RV64I-NEXT: lw t1, 40(a2) +; RV64I-NEXT: lw t2, 44(a2) +; RV64I-NEXT: lw t3, 48(a2) +; RV64I-NEXT: lw t4, 52(a2) +; RV64I-NEXT: lw t5, 56(a2) +; RV64I-NEXT: lw t6, 60(a2) +; RV64I-NEXT: lw s0, 64(a2) +; RV64I-NEXT: lw s1, 68(a2) +; RV64I-NEXT: lw s2, 72(a2) +; RV64I-NEXT: lw s3, 76(a2) +; RV64I-NEXT: lw s4, 80(a2) +; RV64I-NEXT: lw s5, 84(a2) +; RV64I-NEXT: lw s6, 88(a2) +; RV64I-NEXT: lw s7, 92(a2) +; RV64I-NEXT: lw s8, 96(a2) +; RV64I-NEXT: lw s9, 100(a2) +; RV64I-NEXT: lw s10, 104(a2) +; RV64I-NEXT: lw s11, 108(a2) +; RV64I-NEXT: lw ra, 112(a2) +; RV64I-NEXT: lw a3, 116(a2) +; RV64I-NEXT: lw a1, 120(a2) +; RV64I-NEXT: lw a0, 124(a2) +; RV64I-NEXT: lw a7, %lo(var_test_irq+4)(a4) +; RV64I-NEXT: lw a6, %lo(var_test_irq+8)(a4) +; RV64I-NEXT: lw a5, %lo(var_test_irq+12)(a4) +; RV64I-NEXT: sw a0, 124(a2) +; RV64I-NEXT: sw a1, 120(a2) +; RV64I-NEXT: sw a3, 116(a2) +; RV64I-NEXT: sw ra, 112(a2) +; RV64I-NEXT: sw s11, 108(a2) +; RV64I-NEXT: sw s10, 104(a2) +; RV64I-NEXT: sw s9, 100(a2) +; RV64I-NEXT: sw s8, 96(a2) +; RV64I-NEXT: sw s7, 92(a2) +; RV64I-NEXT: sw s6, 88(a2) +; RV64I-NEXT: sw s5, 84(a2) +; RV64I-NEXT: sw s4, 80(a2) +; RV64I-NEXT: sw s3, 76(a2) +; RV64I-NEXT: sw s2, 72(a2) +; RV64I-NEXT: sw s1, 68(a2) +; RV64I-NEXT: sw s0, 64(a2) +; RV64I-NEXT: sw t6, 60(a2) +; RV64I-NEXT: sw t5, 56(a2) +; RV64I-NEXT: sw t4, 52(a2) +; RV64I-NEXT: sw t3, 48(a2) +; RV64I-NEXT: sw t2, 44(a2) +; RV64I-NEXT: sw t1, 40(a2) +; RV64I-NEXT: sw t0, 36(a2) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, 20(a5) +; RV64I-NEXT: sw a0, 32(a2) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, 16(a5) +; RV64I-NEXT: sw a0, 28(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a6) +; RV64I-NEXT: sw a0, 24(a2) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a6) +; 
RV64I-NEXT: sw a0, 20(a2) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a6) +; RV64I-NEXT: sw a0, 16(a2) +; RV64I-NEXT: sw a5, %lo(var_test_irq+12)(a4) +; RV64I-NEXT: sw a6, %lo(var_test_irq+8)(a4) +; RV64I-NEXT: sw a7, %lo(var_test_irq+4)(a4) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a6) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/reduction-formation.ll b/llvm/test/CodeGen/RISCV/reduction-formation.ll index b2dea4237f5a5..1c5b42f038b17 100644 --- a/llvm/test/CodeGen/RISCV/reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/reduction-formation.ll @@ -8,24 +8,24 @@ define i32 @reduce_sum_4xi32(<4 x i32> %v) { ; RV32-LABEL: reduce_sum_4xi32: ; RV32: # %bb.0: -; RV32-NEXT: lw a1, 12(a0) +; RV32-NEXT: lw a1, 0(a0) ; RV32-NEXT: lw a2, 4(a0) -; RV32-NEXT: lw a3, 0(a0) -; RV32-NEXT: lw a0, 8(a0) -; RV32-NEXT: add a2, a3, a2 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a0, 12(a0) +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: reduce_sum_4xi32: ; RV64: # %bb.0: -; RV64-NEXT: lw a1, 24(a0) +; RV64-NEXT: lw a1, 0(a0) ; RV64-NEXT: lw a2, 8(a0) -; RV64-NEXT: lw a3, 0(a0) -; RV64-NEXT: lw a0, 16(a0) -; RV64-NEXT: add a2, a3, a2 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: addw a0, a2, a0 +; RV64-NEXT: lw a3, 16(a0) +; RV64-NEXT: lw a0, 24(a0) +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a0, a3, a0 +; RV64-NEXT: addw a0, a1, a0 ; RV64-NEXT: ret %e0 = extractelement <4 x i32> %v, i32 0 %e1 = extractelement <4 x i32> %v, i32 1 @@ -40,24 +40,24 @@ define i32 @reduce_sum_4xi32(<4 x i32> %v) { define i32 @reduce_xor_4xi32(<4 x i32> %v) { ; RV32-LABEL: reduce_xor_4xi32: ; RV32: # %bb.0: -; RV32-NEXT: lw a1, 12(a0) +; RV32-NEXT: lw a1, 0(a0) ; RV32-NEXT: lw a2, 4(a0) -; RV32-NEXT: lw a3, 0(a0) -; RV32-NEXT: lw a0, 8(a0) -; RV32-NEXT: xor a2, a3, a2 -; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: xor a0, a2, a0 +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a0, 12(a0) +; RV32-NEXT: xor a1, a1, a2 +; RV32-NEXT: xor a0, a3, a0 +; RV32-NEXT: xor a0, a1, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: reduce_xor_4xi32: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 24(a0) +; RV64-NEXT: ld a1, 0(a0) ; RV64-NEXT: ld a2, 8(a0) -; RV64-NEXT: ld a3, 0(a0) -; RV64-NEXT: ld a0, 16(a0) -; RV64-NEXT: xor a2, a3, a2 -; RV64-NEXT: xor a0, a0, a1 -; RV64-NEXT: xor a0, a2, a0 +; RV64-NEXT: ld a3, 16(a0) +; RV64-NEXT: ld a0, 24(a0) +; RV64-NEXT: xor a1, a1, a2 +; RV64-NEXT: xor a0, a3, a0 +; RV64-NEXT: xor a0, a1, a0 ; RV64-NEXT: ret %e0 = extractelement <4 x i32> %v, i32 0 %e1 = extractelement <4 x i32> %v, i32 1 @@ -72,24 +72,24 @@ define i32 @reduce_xor_4xi32(<4 x i32> %v) { define i32 @reduce_or_4xi32(<4 x i32> %v) { ; RV32-LABEL: reduce_or_4xi32: ; RV32: # %bb.0: -; RV32-NEXT: lw a1, 12(a0) +; RV32-NEXT: lw a1, 0(a0) ; RV32-NEXT: lw a2, 4(a0) -; RV32-NEXT: lw a3, 0(a0) -; RV32-NEXT: lw a0, 8(a0) -; RV32-NEXT: or a2, a3, a2 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: lw a3, 8(a0) +; RV32-NEXT: lw a0, 12(a0) +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: reduce_or_4xi32: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 24(a0) +; RV64-NEXT: ld a1, 0(a0) ; 
RV64-NEXT: ld a2, 8(a0) -; RV64-NEXT: ld a3, 0(a0) -; RV64-NEXT: ld a0, 16(a0) -; RV64-NEXT: or a2, a3, a2 -; RV64-NEXT: or a0, a0, a1 -; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ld a3, 16(a0) +; RV64-NEXT: ld a0, 24(a0) +; RV64-NEXT: or a1, a1, a2 +; RV64-NEXT: or a0, a3, a0 +; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: ret %e0 = extractelement <4 x i32> %v, i32 0 %e1 = extractelement <4 x i32> %v, i32 1 diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 5f9ca503bcb05..5a91481572552 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -751,9 +751,9 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw a0, 4(a1) +; RV32I-NEXT: lw s5, 0(a1) ; RV32I-NEXT: lw s2, 8(a1) -; RV32I-NEXT: lw s5, 12(a1) -; RV32I-NEXT: lw s6, 0(a1) +; RV32I-NEXT: lw s6, 12(a1) ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: addi s3, a2, 1365 @@ -775,9 +775,9 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: srli s8, a0, 24 -; RV32I-NEXT: srli a0, s6, 1 +; RV32I-NEXT: srli a0, s5, 1 ; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: sub a0, s6, a0 +; RV32I-NEXT: sub a0, s5, a0 ; RV32I-NEXT: and a1, a0, s4 ; RV32I-NEXT: srli a0, a0, 2 ; RV32I-NEXT: and a0, a0, s4 @@ -789,9 +789,9 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: add s8, a0, s8 -; RV32I-NEXT: srli a0, s5, 1 +; RV32I-NEXT: srli a0, s6, 1 ; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: sub a0, s5, a0 +; RV32I-NEXT: sub a0, s6, a0 ; RV32I-NEXT: and a1, a0, s4 ; RV32I-NEXT: srli a0, a0, 2 ; RV32I-NEXT: and a0, a0, s4 @@ -858,21 +858,21 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64_ult_two: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a1, 0(a0) -; RV32I-NEXT: lw a2, 12(a0) +; RV32I-NEXT: lw a2, 4(a0) ; RV32I-NEXT: lw a3, 8(a0) -; RV32I-NEXT: lw a0, 4(a0) -; RV32I-NEXT: addi a4, a1, -1 -; RV32I-NEXT: and a4, a1, a4 +; RV32I-NEXT: lw a4, 12(a0) +; RV32I-NEXT: addi a0, a1, -1 +; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: seqz a1, a1 -; RV32I-NEXT: sub a1, a0, a1 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: seqz a0, a0 ; RV32I-NEXT: addi a1, a3, -1 ; RV32I-NEXT: and a1, a3, a1 -; RV32I-NEXT: seqz a3, a3 -; RV32I-NEXT: sub a3, a2, a3 -; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: seqz a2, a3 +; RV32I-NEXT: sub a2, a4, a2 +; RV32I-NEXT: and a2, a4, a2 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: seqz a1, a1 ; RV32I-NEXT: ret @@ -901,21 +901,21 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64_ugt_one: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a1, 0(a0) -; RV32I-NEXT: lw a2, 12(a0) +; RV32I-NEXT: lw a2, 4(a0) ; RV32I-NEXT: lw a3, 8(a0) -; RV32I-NEXT: lw a0, 4(a0) -; RV32I-NEXT: addi a4, a1, -1 -; RV32I-NEXT: and a4, a1, a4 +; RV32I-NEXT: lw a4, 12(a0) +; RV32I-NEXT: addi a0, a1, -1 +; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: seqz a1, a1 -; RV32I-NEXT: sub a1, a0, a1 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: addi a1, a3, -1 ; RV32I-NEXT: and a1, a3, a1 -; RV32I-NEXT: seqz a3, a3 -; RV32I-NEXT: sub a3, a2, a3 
-; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: seqz a2, a3 +; RV32I-NEXT: sub a2, a4, a2 +; RV32I-NEXT: and a2, a4, a2 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: snez a1, a1 ; RV32I-NEXT: ret @@ -946,15 +946,15 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64_eq_one: ; RV32I: # %bb.0: ; RV32I-NEXT: mv a1, a0 -; RV32I-NEXT: lw a2, 12(a0) -; RV32I-NEXT: lw a0, 4(a0) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: beqz a0, .LBB22_3 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a2, 12(a1) +; RV32I-NEXT: beqz a3, .LBB22_3 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: seqz a3, a3 -; RV32I-NEXT: sub a3, a0, a3 -; RV32I-NEXT: xor a0, a0, a3 -; RV32I-NEXT: sltu a0, a3, a0 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: sub a0, a3, a0 +; RV32I-NEXT: xor a3, a3, a0 +; RV32I-NEXT: sltu a0, a0, a3 ; RV32I-NEXT: lw a1, 8(a1) ; RV32I-NEXT: bnez a2, .LBB22_4 ; RV32I-NEXT: .LBB22_2: @@ -963,9 +963,9 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { ; RV32I-NEXT: sltu a1, a2, a1 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB22_3: -; RV32I-NEXT: addi a0, a3, -1 -; RV32I-NEXT: xor a3, a3, a0 -; RV32I-NEXT: sltu a0, a0, a3 +; RV32I-NEXT: addi a3, a0, -1 +; RV32I-NEXT: xor a0, a0, a3 +; RV32I-NEXT: sltu a0, a3, a0 ; RV32I-NEXT: lw a1, 8(a1) ; RV32I-NEXT: beqz a2, .LBB22_2 ; RV32I-NEXT: .LBB22_4: @@ -1000,20 +1000,20 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64_ne_one: ; RV32I: # %bb.0: +; RV32I-NEXT: lw a2, 0(a0) +; RV32I-NEXT: lw a3, 4(a0) ; RV32I-NEXT: lw a1, 12(a0) -; RV32I-NEXT: lw a2, 4(a0) -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: beqz a2, .LBB23_2 +; RV32I-NEXT: beqz a3, .LBB23_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: seqz a3, a3 -; RV32I-NEXT: sub a3, a2, a3 -; RV32I-NEXT: xor a2, a2, a3 -; RV32I-NEXT: sltu a2, a3, a2 -; RV32I-NEXT: j .LBB23_3 -; RV32I-NEXT: .LBB23_2: -; RV32I-NEXT: addi a2, a3, -1 +; RV32I-NEXT: seqz a2, a2 +; RV32I-NEXT: sub a2, a3, a2 ; RV32I-NEXT: xor a3, a3, a2 ; RV32I-NEXT: sltu a2, a2, a3 +; RV32I-NEXT: j .LBB23_3 +; RV32I-NEXT: .LBB23_2: +; RV32I-NEXT: addi a3, a2, -1 +; RV32I-NEXT: xor a2, a2, a3 +; RV32I-NEXT: sltu a2, a3, a2 ; RV32I-NEXT: .LBB23_3: ; RV32I-NEXT: lw a3, 8(a0) ; RV32I-NEXT: xori a0, a2, 1 diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll index 456a880891f73..3ecef05e96264 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem.ll @@ -36,9 +36,9 @@ define void @lbu(ptr %a, ptr %b) nounwind { define void @lh(ptr %a, ptr %b) nounwind { ; RV64I-LABEL: lh: ; RV64I: # %bb.0: -; RV64I-NEXT: lh a2, 2(a0) ; RV64I-NEXT: lh zero, 0(a0) -; RV64I-NEXT: sw a2, 0(a1) +; RV64I-NEXT: lh a0, 2(a0) +; RV64I-NEXT: sw a0, 0(a1) ; RV64I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 1 %2 = load i16, ptr %1 @@ -65,9 +65,9 @@ define void @lhu(ptr %a, ptr %b) nounwind { define void @lw(ptr %a, ptr %b) nounwind { ; RV64I-LABEL: lw: ; RV64I: # %bb.0: -; RV64I-NEXT: lw a2, 4(a0) ; RV64I-NEXT: lw zero, 0(a0) -; RV64I-NEXT: sd a2, 0(a1) +; RV64I-NEXT: lw a0, 4(a0) +; RV64I-NEXT: sd a0, 0(a1) ; RV64I-NEXT: ret %1 = getelementptr i32, ptr %a, i64 1 %2 = load i32, ptr %1 diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem64.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem64.ll index 76ab0e7d5810e..e2b885c59b70f 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem64.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/mem64.ll @@ 
-22,9 +22,8 @@ define dso_local i64 @lb(ptr %a) nounwind { define dso_local i64 @lh(ptr %a) nounwind { ; RV64I-LABEL: lh: ; RV64I: # %bb.0: -; RV64I-NEXT: lh a1, 4(a0) ; RV64I-NEXT: lh zero, 0(a0) -; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: lh a0, 4(a0) ; RV64I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 2 %2 = load i16, ptr %1 @@ -37,9 +36,8 @@ define dso_local i64 @lh(ptr %a) nounwind { define dso_local i64 @lw(ptr %a) nounwind { ; RV64I-LABEL: lw: ; RV64I: # %bb.0: -; RV64I-NEXT: lw a1, 12(a0) ; RV64I-NEXT: lw zero, 0(a0) -; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: lw a0, 12(a0) ; RV64I-NEXT: ret %1 = getelementptr i32, ptr %a, i32 3 %2 = load i32, ptr %1 @@ -52,9 +50,9 @@ define dso_local i64 @lw(ptr %a) nounwind { define dso_local i64 @lbu(ptr %a) nounwind { ; RV64I-LABEL: lbu: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a1, 4(a0) -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: lbu a0, 4(a0) +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i8, ptr %a, i32 4 %2 = load i8, ptr %1 @@ -68,9 +66,9 @@ define dso_local i64 @lbu(ptr %a) nounwind { define dso_local i64 @lhu(ptr %a) nounwind { ; RV64I-LABEL: lhu: ; RV64I: # %bb.0: -; RV64I-NEXT: lhu a1, 10(a0) -; RV64I-NEXT: lhu a0, 0(a0) -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 10(a0) +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 5 %2 = load i16, ptr %1 @@ -84,9 +82,9 @@ define dso_local i64 @lhu(ptr %a) nounwind { define dso_local i64 @lwu(ptr %a) nounwind { ; RV64I-LABEL: lwu: ; RV64I: # %bb.0: -; RV64I-NEXT: lwu a1, 24(a0) -; RV64I-NEXT: lwu a0, 0(a0) -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lwu a1, 0(a0) +; RV64I-NEXT: lwu a0, 24(a0) +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i32, ptr %a, i32 6 %2 = load i32, ptr %1 @@ -102,9 +100,8 @@ define dso_local i64 @lwu(ptr %a) nounwind { define dso_local i64 @ld(ptr %a) nounwind { ; RV64I-LABEL: ld: ; RV64I: # %bb.0: -; RV64I-NEXT: ld a1, 80(a0) ; RV64I-NEXT: ld zero, 0(a0) -; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: ld a0, 80(a0) ; RV64I-NEXT: ret %1 = getelementptr i64, ptr %a, i32 10 %2 = load i64, ptr %1 @@ -128,10 +125,10 @@ define dso_local void @sd(ptr %a, i64 %b) nounwind { define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind { ; RV64I-LABEL: load_sext_zext_anyext_i1: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a2, 2(a0) ; RV64I-NEXT: lbu zero, 0(a0) -; RV64I-NEXT: sub a0, a2, a1 +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: lbu a0, 2(a0) +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 @@ -150,10 +147,10 @@ define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind { define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind { ; RV64I-LABEL: load_sext_zext_anyext_i1_i16: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a2, 2(a0) ; RV64I-NEXT: lbu zero, 0(a0) -; RV64I-NEXT: subw a0, a2, a1 +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: lbu a0, 2(a0) +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 diff --git a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll index f38aa71fb158d..6c4466796aeed 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll @@ -177,12 +177,12 @@ define i8 @test13(ptr %0, i64 %1) { ; RV64I-NEXT: li a2, 1 ; RV64I-NEXT: subw a2, a2, a1 ; RV64I-NEXT: add a2, a0, 
a2 -; RV64I-NEXT: lbu a2, 0(a2) ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: subw a3, a3, a1 ; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: lbu a1, 0(a2) ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: ret %3 = mul i64 %1, -4294967296 %4 = add i64 %3, 4294967296 ; 1 << 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll index d34c10798f482..92b88054a1d3b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll +++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll @@ -8,14 +8,14 @@ declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32) define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) { ; RV32-LABEL: vpreduce_add_v4i32: ; RV32: # %bb.0: -; RV32-NEXT: lw a4, 4(a1) -; RV32-NEXT: lw a5, 12(a1) +; RV32-NEXT: lw a4, 0(a1) +; RV32-NEXT: lw a5, 4(a1) ; RV32-NEXT: lw a6, 8(a1) -; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a1, 12(a1) ; RV32-NEXT: lw a7, 0(a2) -; RV32-NEXT: lw t0, 8(a2) -; RV32-NEXT: lw t1, 12(a2) -; RV32-NEXT: lw a2, 4(a2) +; RV32-NEXT: lw t0, 4(a2) +; RV32-NEXT: lw t1, 8(a2) +; RV32-NEXT: lw a2, 12(a2) ; RV32-NEXT: snez t2, a3 ; RV32-NEXT: sltiu t3, a3, 3 ; RV32-NEXT: xori t3, t3, 1 @@ -23,34 +23,34 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) { ; RV32-NEXT: xori t4, t4, 1 ; RV32-NEXT: sltiu a3, a3, 2 ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: and a2, a3, a2 -; RV32-NEXT: and a3, t4, t1 -; RV32-NEXT: and t0, t3, t0 +; RV32-NEXT: and a3, a3, t0 +; RV32-NEXT: and a2, t4, a2 +; RV32-NEXT: and t0, t3, t1 ; RV32-NEXT: and a7, t2, a7 ; RV32-NEXT: neg a7, a7 -; RV32-NEXT: and a1, a7, a1 +; RV32-NEXT: and a4, a7, a4 ; RV32-NEXT: neg a7, t0 ; RV32-NEXT: and a6, a7, a6 -; RV32-NEXT: neg a3, a3 -; RV32-NEXT: and a3, a3, a5 ; RV32-NEXT: neg a2, a2 -; RV32-NEXT: and a2, a2, a4 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a1, a1, a6 -; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: neg a2, a3 +; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: add a1, a4, a1 ; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vpreduce_add_v4i32: ; RV64: # %bb.0: -; RV64-NEXT: lw a4, 8(a1) -; RV64-NEXT: lw a5, 24(a1) +; RV64-NEXT: lw a4, 0(a1) +; RV64-NEXT: lw a5, 8(a1) ; RV64-NEXT: lw a6, 16(a1) -; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: lw a1, 24(a1) ; RV64-NEXT: ld a7, 0(a2) -; RV64-NEXT: ld t0, 16(a2) -; RV64-NEXT: ld t1, 24(a2) -; RV64-NEXT: ld a2, 8(a2) +; RV64-NEXT: ld t0, 8(a2) +; RV64-NEXT: ld t1, 16(a2) +; RV64-NEXT: ld a2, 24(a2) ; RV64-NEXT: sext.w a3, a3 ; RV64-NEXT: snez t2, a3 ; RV64-NEXT: sltiu t3, a3, 3 @@ -59,21 +59,21 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) { ; RV64-NEXT: xori t4, t4, 1 ; RV64-NEXT: sltiu a3, a3, 2 ; RV64-NEXT: xori a3, a3, 1 -; RV64-NEXT: and a2, a3, a2 -; RV64-NEXT: and a3, t4, t1 -; RV64-NEXT: and t0, t3, t0 +; RV64-NEXT: and a3, a3, t0 +; RV64-NEXT: and a2, t4, a2 +; RV64-NEXT: and t0, t3, t1 ; RV64-NEXT: and a7, t2, a7 ; RV64-NEXT: negw a7, a7 -; RV64-NEXT: and a1, a7, a1 +; RV64-NEXT: and a4, a7, a4 ; RV64-NEXT: negw a7, t0 ; RV64-NEXT: and a6, a7, a6 -; RV64-NEXT: negw a3, a3 -; RV64-NEXT: and a3, a3, a5 ; RV64-NEXT: negw a2, a2 -; RV64-NEXT: and a2, a2, a4 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: add a1, a1, a6 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: negw a2, a3 +; RV64-NEXT: and a2, a2, a5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a4, a4, a6 +; RV64-NEXT: 
add a1, a4, a1 ; RV64-NEXT: addw a0, a1, a0 ; RV64-NEXT: ret %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll index 8ed19ddb1af5c..81e20a2988163 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll @@ -26,38 +26,38 @@ define void @add_v4i32(ptr %x, ptr %y) { define void @add_v2i64(ptr %x, ptr %y) { ; RV32-LABEL: add_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: lw a2, 8(a0) -; RV32-NEXT: lw a3, 12(a0) +; RV32-NEXT: lw a2, 0(a1) +; RV32-NEXT: lw a3, 4(a1) ; RV32-NEXT: lw a4, 0(a0) ; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: lw a6, 4(a1) -; RV32-NEXT: lw a7, 0(a1) +; RV32-NEXT: lw a6, 8(a0) +; RV32-NEXT: lw a7, 12(a0) ; RV32-NEXT: lw t0, 8(a1) ; RV32-NEXT: lw a1, 12(a1) -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: add a7, a4, a7 -; RV32-NEXT: sltu a4, a7, a4 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add t0, a2, t0 -; RV32-NEXT: sltu a2, t0, a2 -; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a3, a5, a3 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: sltu a4, a2, a4 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: add a1, a7, a1 +; RV32-NEXT: add t0, a6, t0 +; RV32-NEXT: sltu a4, t0, a6 +; RV32-NEXT: add a1, a1, a4 ; RV32-NEXT: sw t0, 8(a0) -; RV32-NEXT: sw a7, 0(a0) +; RV32-NEXT: sw a2, 0(a0) ; RV32-NEXT: sw a1, 12(a0) -; RV32-NEXT: sw a4, 4(a0) +; RV32-NEXT: sw a3, 4(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: add_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a2, 8(a0) -; RV64-NEXT: ld a3, 0(a0) +; RV64-NEXT: ld a2, 0(a0) +; RV64-NEXT: ld a3, 8(a0) ; RV64-NEXT: ld a4, 0(a1) ; RV64-NEXT: ld a1, 8(a1) -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: add a1, a3, a1 ; RV64-NEXT: sd a1, 8(a0) -; RV64-NEXT: sd a3, 0(a0) +; RV64-NEXT: sd a2, 0(a0) ; RV64-NEXT: ret %a = load <2 x i64>, ptr %x %b = load <2 x i64>, ptr %y @@ -134,14 +134,14 @@ define void @fadd_v4f32(ptr %x, ptr %y) { define void @fadd_v2f64(ptr %x, ptr %y) { ; CHECK-LABEL: fadd_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: fld fa5, 8(a0) -; CHECK-NEXT: fld fa4, 0(a0) +; CHECK-NEXT: fld fa5, 0(a0) +; CHECK-NEXT: fld fa4, 8(a0) ; CHECK-NEXT: fld fa3, 0(a1) ; CHECK-NEXT: fld fa2, 8(a1) -; CHECK-NEXT: fadd.d fa4, fa4, fa3 -; CHECK-NEXT: fadd.d fa5, fa5, fa2 -; CHECK-NEXT: fsd fa5, 8(a0) -; CHECK-NEXT: fsd fa4, 0(a0) +; CHECK-NEXT: fadd.d fa5, fa5, fa3 +; CHECK-NEXT: fadd.d fa4, fa4, fa2 +; CHECK-NEXT: fsd fa4, 8(a0) +; CHECK-NEXT: fsd fa5, 0(a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x %b = load <2 x double>, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll index ed0b15c6add5c..c6208d1fc3cbc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll @@ -357,41 +357,41 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa3, v8 ; RV32-NEXT: feq.d a0, fa3, fa3 +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: fld fa3, 32(sp) -; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vslide1down.vx v8, v10, a0 +; RV32-NEXT: fld fa3, 32(sp) +; RV32-NEXT: fld fa2, 40(sp) +; RV32-NEXT: fld 
fa1, 48(sp) +; RV32-NEXT: fld fa0, 56(sp) ; RV32-NEXT: feq.d a0, fa3, fa3 +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz -; RV32-NEXT: fld fa3, 40(sp) -; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: feq.d a0, fa3, fa3 -; RV32-NEXT: fmax.d fa3, fa3, fa5 +; RV32-NEXT: feq.d a0, fa2, fa2 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: fmax.d fa3, fa2, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz -; RV32-NEXT: fld fa3, 48(sp) -; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: feq.d a0, fa3, fa3 -; RV32-NEXT: fmax.d fa3, fa3, fa5 +; RV32-NEXT: feq.d a0, fa1, fa1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: fmax.d fa3, fa1, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz -; RV32-NEXT: fld fa3, 56(sp) -; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: feq.d a0, fa3, fa3 +; RV32-NEXT: feq.d a0, fa0, fa0 ; RV32-NEXT: neg a0, a0 -; RV32-NEXT: fmax.d fa5, fa3, fa5 +; RV32-NEXT: fmax.d fa5, fa0, fa5 ; RV32-NEXT: fmin.d fa5, fa5, fa4 ; RV32-NEXT: fcvt.w.d a2, fa5, rtz ; RV32-NEXT: and a0, a0, a2 @@ -456,41 +456,41 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vfmv.f.s fa3, v8 ; RV64-NEXT: feq.d a0, fa3, fa3 +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: fld fa3, 32(sp) -; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: fld fa3, 32(sp) +; RV64-NEXT: fld fa2, 40(sp) +; RV64-NEXT: fld fa1, 48(sp) +; RV64-NEXT: fld fa0, 56(sp) ; RV64-NEXT: feq.d a0, fa3, fa3 +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz -; RV64-NEXT: fld fa3, 40(sp) -; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: feq.d a0, fa3, fa3 -; RV64-NEXT: fmax.d fa3, fa3, fa5 +; RV64-NEXT: feq.d a0, fa2, fa2 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: fmax.d fa3, fa2, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz -; RV64-NEXT: fld fa3, 48(sp) -; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: feq.d a0, fa3, fa3 -; RV64-NEXT: fmax.d fa3, fa3, fa5 +; RV64-NEXT: feq.d a0, fa1, fa1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: fmax.d fa3, fa1, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz -; RV64-NEXT: fld fa3, 56(sp) -; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: feq.d a0, fa3, fa3 +; RV64-NEXT: feq.d a0, fa0, fa0 ; RV64-NEXT: neg a0, a0 -; RV64-NEXT: fmax.d fa5, fa3, fa5 +; RV64-NEXT: fmax.d fa5, fa0, fa5 ; RV64-NEXT: fmin.d fa5, fa5, fa4 ; RV64-NEXT: fcvt.l.d a2, fa5, rtz ; RV64-NEXT: and a0, a0, a2 @@ -553,27 +553,27 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa4, v8 ; RV32-NEXT: fmax.d fa4, fa4, fa3 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: fld fa2, 32(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz -; RV32-NEXT: fld fa4, 40(sp) -; RV32-NEXT: fmax.d fa2, fa2, fa3 -; 
RV32-NEXT: fmin.d fa2, fa2, fa5 -; RV32-NEXT: fcvt.wu.d a2, fa2, rtz +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vslide1down.vx v8, v10, a0 +; RV32-NEXT: fld fa4, 32(sp) +; RV32-NEXT: fld fa2, 40(sp) +; RV32-NEXT: fld fa1, 48(sp) +; RV32-NEXT: fld fa0, 56(sp) ; RV32-NEXT: fmax.d fa4, fa4, fa3 -; RV32-NEXT: fld fa2, 48(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 -; RV32-NEXT: fcvt.wu.d a3, fa4, rtz -; RV32-NEXT: vslide1down.vx v8, v10, a0 +; RV32-NEXT: fcvt.wu.d a0, fa4, rtz +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: fmax.d fa4, fa2, fa3 ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz -; RV32-NEXT: fld fa4, 56(sp) -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 ; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: fmax.d fa4, fa4, fa3 +; RV32-NEXT: fmax.d fa4, fa1, fa3 +; RV32-NEXT: fmin.d fa4, fa4, fa5 +; RV32-NEXT: fcvt.wu.d a0, fa4, rtz +; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: fmax.d fa4, fa0, fa3 ; RV32-NEXT: fmin.d fa5, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa5, rtz ; RV32-NEXT: vslide1down.vx v8, v8, a0 @@ -627,27 +627,27 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vfmv.f.s fa4, v8 ; RV64-NEXT: fmax.d fa4, fa4, fa3 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: fld fa2, 32(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz -; RV64-NEXT: fld fa4, 40(sp) -; RV64-NEXT: fmax.d fa2, fa2, fa3 -; RV64-NEXT: fmin.d fa2, fa2, fa5 -; RV64-NEXT: fcvt.lu.d a2, fa2, rtz +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: fld fa4, 32(sp) +; RV64-NEXT: fld fa2, 40(sp) +; RV64-NEXT: fld fa1, 48(sp) +; RV64-NEXT: fld fa0, 56(sp) ; RV64-NEXT: fmax.d fa4, fa4, fa3 -; RV64-NEXT: fld fa2, 48(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 -; RV64-NEXT: fcvt.lu.d a3, fa4, rtz -; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: fcvt.lu.d a0, fa4, rtz +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: fmax.d fa4, fa2, fa3 ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz -; RV64-NEXT: fld fa4, 56(sp) -; RV64-NEXT: vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a3 ; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: fmax.d fa4, fa4, fa3 +; RV64-NEXT: fmax.d fa4, fa1, fa3 +; RV64-NEXT: fmin.d fa4, fa4, fa5 +; RV64-NEXT: fcvt.lu.d a0, fa4, rtz +; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: fmax.d fa4, fa0, fa3 ; RV64-NEXT: fmin.d fa5, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa5, rtz ; RV64-NEXT: vslide1down.vx v8, v8, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index ec11ada12eaa7..861ff459e87f3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -134,12 +134,12 @@ define <3 x float> @si2fp_v3i1_v3f32(<3 x i1> %x) { define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; LMULMAX8RV32-LABEL: si2fp_v3i7_v3f32: ; LMULMAX8RV32: # %bb.0: -; LMULMAX8RV32-NEXT: lw a1, 4(a0) -; LMULMAX8RV32-NEXT: lw a2, 0(a0) -; LMULMAX8RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV32-NEXT: lw a1, 0(a0) +; LMULMAX8RV32-NEXT: lw a2, 4(a0) ; LMULMAX8RV32-NEXT: lw a0, 8(a0) -; LMULMAX8RV32-NEXT: vmv.v.x v8, a2 -; LMULMAX8RV32-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX8RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV32-NEXT: vmv.v.x v8, a1 +; LMULMAX8RV32-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX8RV32-NEXT: 
vslide1down.vx v8, v8, a0 ; LMULMAX8RV32-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX8RV32-NEXT: vadd.vv v8, v8, v8 @@ -151,12 +151,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX8RV64-LABEL: si2fp_v3i7_v3f32: ; LMULMAX8RV64: # %bb.0: -; LMULMAX8RV64-NEXT: ld a1, 8(a0) -; LMULMAX8RV64-NEXT: ld a2, 0(a0) -; LMULMAX8RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV64-NEXT: ld a1, 0(a0) +; LMULMAX8RV64-NEXT: ld a2, 8(a0) ; LMULMAX8RV64-NEXT: ld a0, 16(a0) -; LMULMAX8RV64-NEXT: vmv.v.x v8, a2 -; LMULMAX8RV64-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX8RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV64-NEXT: vmv.v.x v8, a1 +; LMULMAX8RV64-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX8RV64-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX8RV64-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX8RV64-NEXT: vadd.vv v8, v8, v8 @@ -168,12 +168,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX1RV32-LABEL: si2fp_v3i7_v3f32: ; LMULMAX1RV32: # %bb.0: -; LMULMAX1RV32-NEXT: lw a1, 4(a0) -; LMULMAX1RV32-NEXT: lw a2, 0(a0) -; LMULMAX1RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX1RV32-NEXT: lw a1, 0(a0) +; LMULMAX1RV32-NEXT: lw a2, 4(a0) ; LMULMAX1RV32-NEXT: lw a0, 8(a0) -; LMULMAX1RV32-NEXT: vmv.v.x v8, a2 -; LMULMAX1RV32-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX1RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX1RV32-NEXT: vmv.v.x v8, a1 +; LMULMAX1RV32-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX1RV32-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX1RV32-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX1RV32-NEXT: vadd.vv v8, v8, v8 @@ -185,12 +185,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX1RV64-LABEL: si2fp_v3i7_v3f32: ; LMULMAX1RV64: # %bb.0: -; LMULMAX1RV64-NEXT: ld a1, 8(a0) -; LMULMAX1RV64-NEXT: ld a2, 0(a0) -; LMULMAX1RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX1RV64-NEXT: ld a1, 0(a0) +; LMULMAX1RV64-NEXT: ld a2, 8(a0) ; LMULMAX1RV64-NEXT: ld a0, 16(a0) -; LMULMAX1RV64-NEXT: vmv.v.x v8, a2 -; LMULMAX1RV64-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX1RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX1RV64-NEXT: vmv.v.x v8, a1 +; LMULMAX1RV64-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX1RV64-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX1RV64-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX1RV64-NEXT: vadd.vv v8, v8, v8 @@ -202,12 +202,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX8RV32ZVFHMIN-LABEL: si2fp_v3i7_v3f32: ; LMULMAX8RV32ZVFHMIN: # %bb.0: -; LMULMAX8RV32ZVFHMIN-NEXT: lw a1, 4(a0) -; LMULMAX8RV32ZVFHMIN-NEXT: lw a2, 0(a0) -; LMULMAX8RV32ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV32ZVFHMIN-NEXT: lw a1, 0(a0) +; LMULMAX8RV32ZVFHMIN-NEXT: lw a2, 4(a0) ; LMULMAX8RV32ZVFHMIN-NEXT: lw a0, 8(a0) -; LMULMAX8RV32ZVFHMIN-NEXT: vmv.v.x v8, a2 -; LMULMAX8RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX8RV32ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV32ZVFHMIN-NEXT: vmv.v.x v8, a1 +; LMULMAX8RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX8RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX8RV32ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX8RV32ZVFHMIN-NEXT: vadd.vv v8, v8, v8 @@ -219,12 +219,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX8RV64ZVFHMIN-LABEL: si2fp_v3i7_v3f32: ; LMULMAX8RV64ZVFHMIN: # %bb.0: -; LMULMAX8RV64ZVFHMIN-NEXT: ld a1, 8(a0) -; LMULMAX8RV64ZVFHMIN-NEXT: ld a2, 0(a0) -; LMULMAX8RV64ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV64ZVFHMIN-NEXT: ld a1, 0(a0) +; LMULMAX8RV64ZVFHMIN-NEXT: ld a2, 8(a0) ; LMULMAX8RV64ZVFHMIN-NEXT: 
ld a0, 16(a0) -; LMULMAX8RV64ZVFHMIN-NEXT: vmv.v.x v8, a2 -; LMULMAX8RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX8RV64ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV64ZVFHMIN-NEXT: vmv.v.x v8, a1 +; LMULMAX8RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX8RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX8RV64ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX8RV64ZVFHMIN-NEXT: vadd.vv v8, v8, v8 @@ -241,12 +241,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; LMULMAX8RV32-LABEL: ui2fp_v3i7_v3f32: ; LMULMAX8RV32: # %bb.0: -; LMULMAX8RV32-NEXT: lw a1, 4(a0) -; LMULMAX8RV32-NEXT: lw a2, 0(a0) -; LMULMAX8RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV32-NEXT: lw a1, 0(a0) +; LMULMAX8RV32-NEXT: lw a2, 4(a0) ; LMULMAX8RV32-NEXT: lw a0, 8(a0) -; LMULMAX8RV32-NEXT: vmv.v.x v8, a2 -; LMULMAX8RV32-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX8RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV32-NEXT: vmv.v.x v8, a1 +; LMULMAX8RV32-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX8RV32-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX8RV32-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX8RV32-NEXT: li a0, 127 @@ -258,12 +258,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX8RV64-LABEL: ui2fp_v3i7_v3f32: ; LMULMAX8RV64: # %bb.0: -; LMULMAX8RV64-NEXT: ld a1, 8(a0) -; LMULMAX8RV64-NEXT: ld a2, 0(a0) -; LMULMAX8RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV64-NEXT: ld a1, 0(a0) +; LMULMAX8RV64-NEXT: ld a2, 8(a0) ; LMULMAX8RV64-NEXT: ld a0, 16(a0) -; LMULMAX8RV64-NEXT: vmv.v.x v8, a2 -; LMULMAX8RV64-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX8RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV64-NEXT: vmv.v.x v8, a1 +; LMULMAX8RV64-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX8RV64-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX8RV64-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX8RV64-NEXT: li a0, 127 @@ -275,12 +275,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX1RV32-LABEL: ui2fp_v3i7_v3f32: ; LMULMAX1RV32: # %bb.0: -; LMULMAX1RV32-NEXT: lw a1, 4(a0) -; LMULMAX1RV32-NEXT: lw a2, 0(a0) -; LMULMAX1RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX1RV32-NEXT: lw a1, 0(a0) +; LMULMAX1RV32-NEXT: lw a2, 4(a0) ; LMULMAX1RV32-NEXT: lw a0, 8(a0) -; LMULMAX1RV32-NEXT: vmv.v.x v8, a2 -; LMULMAX1RV32-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX1RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX1RV32-NEXT: vmv.v.x v8, a1 +; LMULMAX1RV32-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX1RV32-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX1RV32-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX1RV32-NEXT: li a0, 127 @@ -292,12 +292,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX1RV64-LABEL: ui2fp_v3i7_v3f32: ; LMULMAX1RV64: # %bb.0: -; LMULMAX1RV64-NEXT: ld a1, 8(a0) -; LMULMAX1RV64-NEXT: ld a2, 0(a0) -; LMULMAX1RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX1RV64-NEXT: ld a1, 0(a0) +; LMULMAX1RV64-NEXT: ld a2, 8(a0) ; LMULMAX1RV64-NEXT: ld a0, 16(a0) -; LMULMAX1RV64-NEXT: vmv.v.x v8, a2 -; LMULMAX1RV64-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX1RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX1RV64-NEXT: vmv.v.x v8, a1 +; LMULMAX1RV64-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX1RV64-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX1RV64-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX1RV64-NEXT: li a0, 127 @@ -309,12 +309,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX8RV32ZVFHMIN-LABEL: ui2fp_v3i7_v3f32: ; LMULMAX8RV32ZVFHMIN: # %bb.0: -; LMULMAX8RV32ZVFHMIN-NEXT: lw a1, 
4(a0) -; LMULMAX8RV32ZVFHMIN-NEXT: lw a2, 0(a0) -; LMULMAX8RV32ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV32ZVFHMIN-NEXT: lw a1, 0(a0) +; LMULMAX8RV32ZVFHMIN-NEXT: lw a2, 4(a0) ; LMULMAX8RV32ZVFHMIN-NEXT: lw a0, 8(a0) -; LMULMAX8RV32ZVFHMIN-NEXT: vmv.v.x v8, a2 -; LMULMAX8RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX8RV32ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV32ZVFHMIN-NEXT: vmv.v.x v8, a1 +; LMULMAX8RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX8RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX8RV32ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX8RV32ZVFHMIN-NEXT: li a0, 127 @@ -326,12 +326,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; LMULMAX8RV64ZVFHMIN-LABEL: ui2fp_v3i7_v3f32: ; LMULMAX8RV64ZVFHMIN: # %bb.0: -; LMULMAX8RV64ZVFHMIN-NEXT: ld a1, 8(a0) -; LMULMAX8RV64ZVFHMIN-NEXT: ld a2, 0(a0) -; LMULMAX8RV64ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV64ZVFHMIN-NEXT: ld a1, 0(a0) +; LMULMAX8RV64ZVFHMIN-NEXT: ld a2, 8(a0) ; LMULMAX8RV64ZVFHMIN-NEXT: ld a0, 16(a0) -; LMULMAX8RV64ZVFHMIN-NEXT: vmv.v.x v8, a2 -; LMULMAX8RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 +; LMULMAX8RV64ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; LMULMAX8RV64ZVFHMIN-NEXT: vmv.v.x v8, a1 +; LMULMAX8RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 ; LMULMAX8RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 ; LMULMAX8RV64ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 ; LMULMAX8RV64ZVFHMIN-NEXT: li a0, 127 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll index 8acc70faaa1fc..432c49514bb65 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll @@ -7,25 +7,25 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-LABEL: load_large_vector: ; ZVE32X: # %bb.0: -; ZVE32X-NEXT: ld a1, 80(a0) -; ZVE32X-NEXT: ld a2, 72(a0) -; ZVE32X-NEXT: ld a3, 56(a0) +; ZVE32X-NEXT: ld a1, 0(a0) +; ZVE32X-NEXT: ld a2, 8(a0) +; ZVE32X-NEXT: ld a3, 24(a0) ; ZVE32X-NEXT: ld a4, 32(a0) -; ZVE32X-NEXT: ld a5, 24(a0) -; ZVE32X-NEXT: ld a6, 48(a0) -; ZVE32X-NEXT: ld a7, 8(a0) -; ZVE32X-NEXT: ld a0, 0(a0) -; ZVE32X-NEXT: xor a4, a5, a4 -; ZVE32X-NEXT: snez a4, a4 +; ZVE32X-NEXT: ld a5, 48(a0) +; ZVE32X-NEXT: ld a6, 56(a0) +; ZVE32X-NEXT: ld a7, 72(a0) +; ZVE32X-NEXT: ld a0, 80(a0) ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; ZVE32X-NEXT: vmv.s.x v8, a4 +; ZVE32X-NEXT: xor a3, a3, a4 +; ZVE32X-NEXT: snez a3, a3 +; ZVE32X-NEXT: vmv.s.x v8, a3 ; ZVE32X-NEXT: vand.vi v8, v8, 1 ; ZVE32X-NEXT: vmsne.vi v0, v8, 0 ; ZVE32X-NEXT: vmv.s.x v8, zero ; ZVE32X-NEXT: vmerge.vim v9, v8, 1, v0 -; ZVE32X-NEXT: xor a0, a0, a7 -; ZVE32X-NEXT: snez a0, a0 -; ZVE32X-NEXT: vmv.s.x v10, a0 +; ZVE32X-NEXT: xor a1, a1, a2 +; ZVE32X-NEXT: snez a1, a1 +; ZVE32X-NEXT: vmv.s.x v10, a1 ; ZVE32X-NEXT: vand.vi v10, v10, 1 ; ZVE32X-NEXT: vmsne.vi v0, v10, 0 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma @@ -36,9 +36,9 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v0, v11, 0 ; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0 -; ZVE32X-NEXT: xor a0, a6, a3 -; ZVE32X-NEXT: snez a0, a0 -; ZVE32X-NEXT: vmv.s.x v11, a0 +; ZVE32X-NEXT: xor a1, a5, a6 +; ZVE32X-NEXT: snez a1, a1 +; ZVE32X-NEXT: vmv.s.x v11, a1 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v11, v11, 1 ; ZVE32X-NEXT: vmsne.vi v0, v11, 0 @@ 
-48,8 +48,8 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v0, v9, 0 ; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0 -; ZVE32X-NEXT: xor a1, a2, a1 -; ZVE32X-NEXT: snez a0, a1 +; ZVE32X-NEXT: xor a0, a7, a0 +; ZVE32X-NEXT: snez a0, a0 ; ZVE32X-NEXT: vmv.s.x v10, a0 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v10, v10, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll index 7c5047bbdf635..db058fc5c102e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll @@ -453,19 +453,25 @@ declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>) define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-LABEL: llrint_v16i64_v16f32: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -400 -; RV32-NEXT: .cfi_def_cfa_offset 400 -; RV32-NEXT: sw ra, 396(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 392(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -384 +; RV32-NEXT: .cfi_def_cfa_offset 384 +; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill +; RV32-NEXT: fsd fs0, 368(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs1, 360(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs2, 352(sp) # 8-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 400 +; RV32-NEXT: .cfi_offset fs0, -16 +; RV32-NEXT: .cfi_offset fs1, -24 +; RV32-NEXT: .cfi_offset fs2, -32 +; RV32-NEXT: addi s0, sp, 384 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 2 ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -128 -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vs4r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: addi a0, sp, 64 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma @@ -487,30 +493,33 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-NEXT: sw a1, 228(sp) ; RV32-NEXT: sw a0, 224(sp) ; RV32-NEXT: flw fa0, 108(sp) +; RV32-NEXT: flw fs0, 96(sp) +; RV32-NEXT: flw fs1, 100(sp) +; RV32-NEXT: flw fs2, 104(sp) ; RV32-NEXT: call llrintf@plt ; RV32-NEXT: sw a1, 220(sp) ; RV32-NEXT: sw a0, 216(sp) -; RV32-NEXT: flw fa0, 104(sp) +; RV32-NEXT: fmv.s fa0, fs2 ; RV32-NEXT: call llrintf@plt ; RV32-NEXT: sw a1, 212(sp) ; RV32-NEXT: sw a0, 208(sp) -; RV32-NEXT: flw fa0, 100(sp) +; RV32-NEXT: fmv.s fa0, fs1 ; RV32-NEXT: call llrintf@plt ; RV32-NEXT: sw a1, 204(sp) ; RV32-NEXT: sw a0, 200(sp) -; RV32-NEXT: flw fa0, 96(sp) +; RV32-NEXT: fmv.s fa0, fs0 ; RV32-NEXT: call llrintf@plt ; RV32-NEXT: sw a1, 196(sp) ; RV32-NEXT: sw a0, 192(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vfmv.f.s fa0, v8 ; RV32-NEXT: call llrintf@plt ; RV32-NEXT: sw a1, 132(sp) ; RV32-NEXT: sw a0, 128(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -518,7 +527,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-NEXT: sw a1, 156(sp) ; RV32-NEXT: sw a0, 152(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded 
Reload ; RV32-NEXT: vslidedown.vi v8, v8, 2 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -526,7 +535,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-NEXT: sw a1, 148(sp) ; RV32-NEXT: sw a0, 144(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -534,7 +543,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-NEXT: sw a1, 140(sp) ; RV32-NEXT: sw a0, 136(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 7 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -542,7 +551,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-NEXT: sw a1, 188(sp) ; RV32-NEXT: sw a0, 184(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 6 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -550,7 +559,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-NEXT: sw a1, 180(sp) ; RV32-NEXT: sw a0, 176(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 5 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -558,7 +567,7 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-NEXT: sw a1, 172(sp) ; RV32-NEXT: sw a0, 168(sp) ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: addi a0, sp, 384 +; RV32-NEXT: addi a0, sp, 352 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 4 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -569,10 +578,13 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV32-NEXT: addi a1, sp, 128 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: addi sp, s0, -400 -; RV32-NEXT: lw ra, 396(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 392(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 400 +; RV32-NEXT: addi sp, s0, -384 +; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload +; RV32-NEXT: fld fs0, 368(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs1, 360(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs2, 352(sp) # 8-byte Folded Reload +; RV32-NEXT: addi sp, sp, 384 ; RV32-NEXT: ret ; ; RV64-LABEL: llrint_v16i64_v16f32: @@ -602,16 +614,16 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: sd a0, 224(sp) ; RV64-NEXT: flw fa5, 108(sp) +; RV64-NEXT: flw fa4, 104(sp) +; RV64-NEXT: flw fa3, 96(sp) +; RV64-NEXT: flw fa2, 100(sp) ; RV64-NEXT: fcvt.l.s a0, fa5 ; RV64-NEXT: sd a0, 216(sp) -; RV64-NEXT: flw fa5, 104(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 +; RV64-NEXT: fcvt.l.s a0, fa4 ; RV64-NEXT: sd a0, 208(sp) -; RV64-NEXT: flw fa5, 100(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 +; RV64-NEXT: fcvt.l.s a0, fa2 ; RV64-NEXT: sd a0, 200(sp) -; RV64-NEXT: flw fa5, 96(sp) -; RV64-NEXT: fcvt.l.s a0, fa5 +; RV64-NEXT: fcvt.l.s a0, fa3 ; RV64-NEXT: sd a0, 192(sp) ; RV64-NEXT: vfmv.f.s fa5, v8 ; RV64-NEXT: fcvt.l.s a0, fa5 @@ -857,48 +869,57 @@ declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>) define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; RV32-LABEL: 
llrint_v8i64_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -272 -; RV32-NEXT: .cfi_def_cfa_offset 272 -; RV32-NEXT: sw ra, 268(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 264(sp) # 4-byte Folded Spill +; RV32-NEXT: addi sp, sp, -256 +; RV32-NEXT: .cfi_def_cfa_offset 256 +; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill +; RV32-NEXT: fsd fs0, 240(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs1, 232(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs2, 224(sp) # 8-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 272 +; RV32-NEXT: .cfi_offset fs0, -16 +; RV32-NEXT: .cfi_offset fs1, -24 +; RV32-NEXT: .cfi_offset fs2, -32 +; RV32-NEXT: addi s0, sp, 256 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 2 ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: addi a0, sp, 256 +; RV32-NEXT: addi a0, sp, 224 ; RV32-NEXT: vs4r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: addi a0, sp, 64 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: fld fa0, 120(sp) +; RV32-NEXT: fld fs0, 96(sp) +; RV32-NEXT: fld fs1, 104(sp) +; RV32-NEXT: fld fs2, 112(sp) ; RV32-NEXT: call llrint@plt ; RV32-NEXT: sw a1, 188(sp) ; RV32-NEXT: sw a0, 184(sp) -; RV32-NEXT: fld fa0, 112(sp) +; RV32-NEXT: fmv.d fa0, fs2 ; RV32-NEXT: call llrint@plt ; RV32-NEXT: sw a1, 180(sp) ; RV32-NEXT: sw a0, 176(sp) -; RV32-NEXT: fld fa0, 104(sp) +; RV32-NEXT: fmv.d fa0, fs1 ; RV32-NEXT: call llrint@plt ; RV32-NEXT: sw a1, 172(sp) ; RV32-NEXT: sw a0, 168(sp) -; RV32-NEXT: fld fa0, 96(sp) +; RV32-NEXT: fmv.d fa0, fs0 ; RV32-NEXT: call llrint@plt ; RV32-NEXT: sw a1, 164(sp) ; RV32-NEXT: sw a0, 160(sp) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: addi a0, sp, 256 +; RV32-NEXT: addi a0, sp, 224 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vfmv.f.s fa0, v8 ; RV32-NEXT: call llrint@plt ; RV32-NEXT: sw a1, 132(sp) ; RV32-NEXT: sw a0, 128(sp) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: addi a0, sp, 256 +; RV32-NEXT: addi a0, sp, 224 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 1 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -906,7 +927,7 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; RV32-NEXT: sw a1, 140(sp) ; RV32-NEXT: sw a0, 136(sp) ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: addi a0, sp, 256 +; RV32-NEXT: addi a0, sp, 224 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -914,7 +935,7 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; RV32-NEXT: sw a1, 156(sp) ; RV32-NEXT: sw a0, 152(sp) ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: addi a0, sp, 256 +; RV32-NEXT: addi a0, sp, 224 ; RV32-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vslidedown.vi v8, v8, 2 ; RV32-NEXT: vfmv.f.s fa0, v8 @@ -924,10 +945,13 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; RV32-NEXT: addi a0, sp, 128 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi sp, s0, -272 -; RV32-NEXT: lw ra, 268(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 264(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 272 +; RV32-NEXT: addi sp, s0, -256 +; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload +; RV32-NEXT: fld fs0, 240(sp) # 
8-byte Folded Reload +; RV32-NEXT: fld fs1, 232(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs2, 224(sp) # 8-byte Folded Reload +; RV32-NEXT: addi sp, sp, 256 ; RV32-NEXT: ret ; ; RV64-LABEL: llrint_v8i64_v8f64: @@ -945,16 +969,16 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: fld fa5, 56(sp) +; RV64-NEXT: fld fa4, 48(sp) +; RV64-NEXT: fld fa3, 32(sp) +; RV64-NEXT: fld fa2, 40(sp) ; RV64-NEXT: fcvt.l.d a0, fa5 ; RV64-NEXT: sd a0, 120(sp) -; RV64-NEXT: fld fa5, 48(sp) -; RV64-NEXT: fcvt.l.d a0, fa5 +; RV64-NEXT: fcvt.l.d a0, fa4 ; RV64-NEXT: sd a0, 112(sp) -; RV64-NEXT: fld fa5, 40(sp) -; RV64-NEXT: fcvt.l.d a0, fa5 +; RV64-NEXT: fcvt.l.d a0, fa2 ; RV64-NEXT: sd a0, 104(sp) -; RV64-NEXT: fld fa5, 32(sp) -; RV64-NEXT: fcvt.l.d a0, fa5 +; RV64-NEXT: fcvt.l.d a0, fa3 ; RV64-NEXT: sd a0, 96(sp) ; RV64-NEXT: vfmv.f.s fa5, v8 ; RV64-NEXT: fcvt.l.d a0, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll index 224f5066138cd..3ad2b862e17e4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll @@ -574,20 +574,20 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: fld fa4, 32(sp) -; RV32-NEXT: fld fa3, 40(sp) ; RV32-NEXT: fcvt.w.d a0, fa5 -; RV32-NEXT: fld fa5, 48(sp) -; RV32-NEXT: fcvt.w.d a1, fa4 -; RV32-NEXT: fcvt.w.d a2, fa3 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vslide1down.vx v8, v10, a0 +; RV32-NEXT: fld fa5, 32(sp) +; RV32-NEXT: fld fa4, 40(sp) +; RV32-NEXT: fld fa3, 48(sp) +; RV32-NEXT: fld fa2, 56(sp) ; RV32-NEXT: fcvt.w.d a0, fa5 -; RV32-NEXT: fld fa5, 56(sp) -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: fcvt.w.d a0, fa5 +; RV32-NEXT: fcvt.w.d a0, fa4 +; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: fcvt.w.d a0, fa3 +; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: fcvt.w.d a0, fa2 ; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: addi sp, s0, -128 ; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload @@ -627,20 +627,20 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV64-i32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-i32-NEXT: fld fa4, 32(sp) -; RV64-i32-NEXT: fld fa3, 40(sp) ; RV64-i32-NEXT: fcvt.l.d a0, fa5 -; RV64-i32-NEXT: fld fa5, 48(sp) -; RV64-i32-NEXT: fcvt.l.d a1, fa4 -; RV64-i32-NEXT: fcvt.l.d a2, fa3 +; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-i32-NEXT: vslide1down.vx v8, v10, a0 +; RV64-i32-NEXT: fld fa5, 32(sp) +; RV64-i32-NEXT: fld fa4, 40(sp) +; RV64-i32-NEXT: fld fa3, 48(sp) +; RV64-i32-NEXT: fld fa2, 56(sp) ; RV64-i32-NEXT: fcvt.l.d a0, fa5 -; RV64-i32-NEXT: fld fa5, 56(sp) -; RV64-i32-NEXT: vslide1down.vx v8, v8, a1 -; RV64-i32-NEXT: vslide1down.vx v8, v8, a2 ; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 -; RV64-i32-NEXT: fcvt.l.d a0, fa5 +; RV64-i32-NEXT: fcvt.l.d a0, fa4 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i32-NEXT: fcvt.l.d a0, fa3 +; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 +; RV64-i32-NEXT: fcvt.l.d a0, fa2 ; RV64-i32-NEXT: vslide1down.vx v8, v8, a0 ; RV64-i32-NEXT: addi sp, s0, 
-128 ; RV64-i32-NEXT: ld ra, 120(sp) # 8-byte Folded Reload @@ -663,16 +663,16 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV64-i64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-i64-NEXT: vse64.v v8, (a0) ; RV64-i64-NEXT: fld fa5, 56(sp) +; RV64-i64-NEXT: fld fa4, 48(sp) +; RV64-i64-NEXT: fld fa3, 32(sp) +; RV64-i64-NEXT: fld fa2, 40(sp) ; RV64-i64-NEXT: fcvt.l.d a0, fa5 ; RV64-i64-NEXT: sd a0, 120(sp) -; RV64-i64-NEXT: fld fa5, 48(sp) -; RV64-i64-NEXT: fcvt.l.d a0, fa5 +; RV64-i64-NEXT: fcvt.l.d a0, fa4 ; RV64-i64-NEXT: sd a0, 112(sp) -; RV64-i64-NEXT: fld fa5, 40(sp) -; RV64-i64-NEXT: fcvt.l.d a0, fa5 +; RV64-i64-NEXT: fcvt.l.d a0, fa2 ; RV64-i64-NEXT: sd a0, 104(sp) -; RV64-i64-NEXT: fld fa5, 32(sp) -; RV64-i64-NEXT: fcvt.l.d a0, fa5 +; RV64-i64-NEXT: fcvt.l.d a0, fa3 ; RV64-i64-NEXT: sd a0, 96(sp) ; RV64-i64-NEXT: vfmv.f.s fa5, v8 ; RV64-i64-NEXT: fcvt.l.d a0, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index ac3bf0d89b5ed..6e8470af27a86 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -3599,9 +3599,9 @@ define <1 x i64> @mgather_v1i64(<1 x ptr> %ptrs, <1 x i1> %m, <1 x i64> %passthr ; RV32ZVE32F-NEXT: bnez a2, .LBB42_2 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a0) -; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a0, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) ; RV32ZVE32F-NEXT: .LBB42_2: # %else ; RV32ZVE32F-NEXT: ret ; @@ -3645,30 +3645,30 @@ define <2 x i64> @mgather_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %passthr ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a2, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a2, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, a4, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB43_4 ; RV32ZVE32F-NEXT: .LBB43_2: -; RV32ZVE32F-NEXT: lw a4, 12(a1) -; RV32ZVE32F-NEXT: lw a1, 8(a1) +; RV32ZVE32F-NEXT: lw a4, 8(a1) +; RV32ZVE32F-NEXT: lw a1, 12(a1) ; RV32ZVE32F-NEXT: j .LBB43_5 ; RV32ZVE32F-NEXT: .LBB43_3: -; RV32ZVE32F-NEXT: lw a2, 4(a1) -; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a3, 4(a1) ; RV32ZVE32F-NEXT: andi a4, a4, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB43_2 ; RV32ZVE32F-NEXT: .LBB43_4: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw a4, 4(a1) -; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: lw a4, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) ; RV32ZVE32F-NEXT: .LBB43_5: # %else2 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) -; RV32ZVE32F-NEXT: sw a1, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a1, 12(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2i64: @@ -3718,60 +3718,60 @@ define <4 x i64> @mgather_v4i64(<4 x ptr> %ptrs, <4 x i1> %m, <4 x i64> %passthr ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a2, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw 
a2, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, a6, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB44_6 ; RV32ZVE32F-NEXT: .LBB44_2: -; RV32ZVE32F-NEXT: lw a4, 12(a1) -; RV32ZVE32F-NEXT: lw a5, 8(a1) +; RV32ZVE32F-NEXT: lw a4, 8(a1) +; RV32ZVE32F-NEXT: lw a5, 12(a1) ; RV32ZVE32F-NEXT: andi a7, a6, 4 ; RV32ZVE32F-NEXT: bnez a7, .LBB44_7 ; RV32ZVE32F-NEXT: .LBB44_3: -; RV32ZVE32F-NEXT: lw a7, 20(a1) -; RV32ZVE32F-NEXT: lw t0, 16(a1) +; RV32ZVE32F-NEXT: lw a7, 16(a1) +; RV32ZVE32F-NEXT: lw t0, 20(a1) ; RV32ZVE32F-NEXT: andi a6, a6, 8 ; RV32ZVE32F-NEXT: bnez a6, .LBB44_8 ; RV32ZVE32F-NEXT: .LBB44_4: -; RV32ZVE32F-NEXT: lw a6, 28(a1) -; RV32ZVE32F-NEXT: lw a1, 24(a1) +; RV32ZVE32F-NEXT: lw a6, 24(a1) +; RV32ZVE32F-NEXT: lw a1, 28(a1) ; RV32ZVE32F-NEXT: j .LBB44_9 ; RV32ZVE32F-NEXT: .LBB44_5: -; RV32ZVE32F-NEXT: lw a2, 4(a1) -; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a3, 4(a1) ; RV32ZVE32F-NEXT: andi a4, a6, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB44_2 ; RV32ZVE32F-NEXT: .LBB44_6: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v9 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a7, a6, 4 ; RV32ZVE32F-NEXT: beqz a7, .LBB44_3 ; RV32ZVE32F-NEXT: .LBB44_7: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s t0, v9 -; RV32ZVE32F-NEXT: lw a7, 4(t0) -; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: lw a7, 0(t0) +; RV32ZVE32F-NEXT: lw t0, 4(t0) ; RV32ZVE32F-NEXT: andi a6, a6, 8 ; RV32ZVE32F-NEXT: beqz a6, .LBB44_4 ; RV32ZVE32F-NEXT: .LBB44_8: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw a6, 4(a1) -; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: lw a6, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) ; RV32ZVE32F-NEXT: .LBB44_9: # %else8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw t0, 16(a0) -; RV32ZVE32F-NEXT: sw a7, 20(a0) -; RV32ZVE32F-NEXT: sw a1, 24(a0) -; RV32ZVE32F-NEXT: sw a6, 28(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw t0, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 24(a0) +; RV32ZVE32F-NEXT: sw a1, 28(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v4i64: @@ -3846,60 +3846,60 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) { ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a2, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a2, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, a6, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB45_6 ; RV32ZVE32F-NEXT: .LBB45_2: -; RV32ZVE32F-NEXT: lw a4, 12(a1) -; RV32ZVE32F-NEXT: lw a5, 8(a1) +; RV32ZVE32F-NEXT: lw a4, 8(a1) +; RV32ZVE32F-NEXT: lw a5, 12(a1) ; RV32ZVE32F-NEXT: andi a7, a6, 4 ; RV32ZVE32F-NEXT: bnez a7, .LBB45_7 ; RV32ZVE32F-NEXT: .LBB45_3: -; RV32ZVE32F-NEXT: lw a7, 20(a1) -; RV32ZVE32F-NEXT: lw t0, 16(a1) +; RV32ZVE32F-NEXT: lw a7, 16(a1) +; RV32ZVE32F-NEXT: lw t0, 20(a1) ; 
RV32ZVE32F-NEXT: andi a6, a6, 8 ; RV32ZVE32F-NEXT: bnez a6, .LBB45_8 ; RV32ZVE32F-NEXT: .LBB45_4: -; RV32ZVE32F-NEXT: lw a6, 28(a1) -; RV32ZVE32F-NEXT: lw a1, 24(a1) +; RV32ZVE32F-NEXT: lw a6, 24(a1) +; RV32ZVE32F-NEXT: lw a1, 28(a1) ; RV32ZVE32F-NEXT: j .LBB45_9 ; RV32ZVE32F-NEXT: .LBB45_5: -; RV32ZVE32F-NEXT: lw a2, 4(a1) -; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a3, 4(a1) ; RV32ZVE32F-NEXT: andi a4, a6, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB45_2 ; RV32ZVE32F-NEXT: .LBB45_6: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v9 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a7, a6, 4 ; RV32ZVE32F-NEXT: beqz a7, .LBB45_3 ; RV32ZVE32F-NEXT: .LBB45_7: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s t0, v9 -; RV32ZVE32F-NEXT: lw a7, 4(t0) -; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: lw a7, 0(t0) +; RV32ZVE32F-NEXT: lw t0, 4(t0) ; RV32ZVE32F-NEXT: andi a6, a6, 8 ; RV32ZVE32F-NEXT: beqz a6, .LBB45_4 ; RV32ZVE32F-NEXT: .LBB45_8: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw a6, 4(a1) -; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: lw a6, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) ; RV32ZVE32F-NEXT: .LBB45_9: # %else8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw t0, 16(a0) -; RV32ZVE32F-NEXT: sw a7, 20(a0) -; RV32ZVE32F-NEXT: sw a1, 24(a0) -; RV32ZVE32F-NEXT: sw a6, 28(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw t0, 20(a0) +; RV32ZVE32F-NEXT: sw a6, 24(a0) +; RV32ZVE32F-NEXT: sw a1, 28(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i64: @@ -3966,22 +3966,22 @@ define <4 x i64> @mgather_falsemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) ; ; RV32ZVE32F-LABEL: mgather_falsemask_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a2, 0(a1) -; RV32ZVE32F-NEXT: lw a3, 4(a1) -; RV32ZVE32F-NEXT: lw a4, 8(a1) -; RV32ZVE32F-NEXT: lw a5, 12(a1) -; RV32ZVE32F-NEXT: lw a6, 28(a1) -; RV32ZVE32F-NEXT: lw a7, 24(a1) -; RV32ZVE32F-NEXT: lw t0, 20(a1) +; RV32ZVE32F-NEXT: lw a2, 20(a1) +; RV32ZVE32F-NEXT: lw a3, 24(a1) +; RV32ZVE32F-NEXT: lw a4, 28(a1) +; RV32ZVE32F-NEXT: lw a5, 0(a1) +; RV32ZVE32F-NEXT: lw a6, 4(a1) +; RV32ZVE32F-NEXT: lw a7, 8(a1) +; RV32ZVE32F-NEXT: lw t0, 12(a1) ; RV32ZVE32F-NEXT: lw a1, 16(a1) -; RV32ZVE32F-NEXT: sw a6, 28(a0) -; RV32ZVE32F-NEXT: sw a7, 24(a0) -; RV32ZVE32F-NEXT: sw t0, 20(a0) +; RV32ZVE32F-NEXT: sw a4, 28(a0) +; RV32ZVE32F-NEXT: sw a3, 24(a0) +; RV32ZVE32F-NEXT: sw a2, 20(a0) ; RV32ZVE32F-NEXT: sw a1, 16(a0) -; RV32ZVE32F-NEXT: sw a5, 12(a0) -; RV32ZVE32F-NEXT: sw a4, 8(a0) -; RV32ZVE32F-NEXT: sw a3, 4(a0) -; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 8(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 0(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_falsemask_v4i64: @@ -4025,77 +4025,77 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; 
RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a2, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a2, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB47_8 ; RV32ZVE32F-NEXT: .LBB47_2: -; RV32ZVE32F-NEXT: lw a4, 12(a1) -; RV32ZVE32F-NEXT: lw a5, 8(a1) +; RV32ZVE32F-NEXT: lw a4, 8(a1) +; RV32ZVE32F-NEXT: lw a5, 12(a1) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB47_9 ; RV32ZVE32F-NEXT: .LBB47_3: -; RV32ZVE32F-NEXT: lw a6, 20(a1) -; RV32ZVE32F-NEXT: lw a7, 16(a1) +; RV32ZVE32F-NEXT: lw a6, 16(a1) +; RV32ZVE32F-NEXT: lw a7, 20(a1) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB47_10 ; RV32ZVE32F-NEXT: .LBB47_4: -; RV32ZVE32F-NEXT: lw t1, 28(a1) -; RV32ZVE32F-NEXT: lw t2, 24(a1) +; RV32ZVE32F-NEXT: lw t1, 24(a1) +; RV32ZVE32F-NEXT: lw t2, 28(a1) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB47_11 ; RV32ZVE32F-NEXT: .LBB47_5: -; RV32ZVE32F-NEXT: lw t3, 36(a1) -; RV32ZVE32F-NEXT: lw t4, 32(a1) +; RV32ZVE32F-NEXT: lw t3, 32(a1) +; RV32ZVE32F-NEXT: lw t4, 36(a1) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB47_12 ; RV32ZVE32F-NEXT: .LBB47_6: -; RV32ZVE32F-NEXT: lw t5, 44(a1) -; RV32ZVE32F-NEXT: lw t6, 40(a1) +; RV32ZVE32F-NEXT: lw t5, 40(a1) +; RV32ZVE32F-NEXT: lw t6, 44(a1) ; RV32ZVE32F-NEXT: j .LBB47_13 ; RV32ZVE32F-NEXT: .LBB47_7: -; RV32ZVE32F-NEXT: lw a2, 4(a1) -; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a3, 4(a1) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB47_2 ; RV32ZVE32F-NEXT: .LBB47_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB47_3 ; RV32ZVE32F-NEXT: .LBB47_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB47_4 ; RV32ZVE32F-NEXT: .LBB47_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB47_5 ; RV32ZVE32F-NEXT: .LBB47_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB47_6 ; RV32ZVE32F-NEXT: .LBB47_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: 
.LBB47_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -4109,42 +4109,42 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB47_17 ; RV32ZVE32F-NEXT: .LBB47_15: -; RV32ZVE32F-NEXT: lw t0, 60(a1) -; RV32ZVE32F-NEXT: lw a1, 56(a1) +; RV32ZVE32F-NEXT: lw t0, 56(a1) +; RV32ZVE32F-NEXT: lw a1, 60(a1) ; RV32ZVE32F-NEXT: j .LBB47_18 ; RV32ZVE32F-NEXT: .LBB47_16: -; RV32ZVE32F-NEXT: lw s0, 52(a1) -; RV32ZVE32F-NEXT: lw s1, 48(a1) +; RV32ZVE32F-NEXT: lw s0, 48(a1) +; RV32ZVE32F-NEXT: lw s1, 52(a1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB47_15 ; RV32ZVE32F-NEXT: .LBB47_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a1) -; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: lw t0, 0(a1) +; RV32ZVE32F-NEXT: lw a1, 4(a1) ; RV32ZVE32F-NEXT: .LBB47_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a1, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a1, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -4272,77 +4272,77 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB48_8 ; RV32ZVE32F-NEXT: .LBB48_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB48_9 ; RV32ZVE32F-NEXT: .LBB48_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB48_10 ; RV32ZVE32F-NEXT: .LBB48_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 
24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB48_11 ; RV32ZVE32F-NEXT: .LBB48_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB48_12 ; RV32ZVE32F-NEXT: .LBB48_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB48_13 ; RV32ZVE32F-NEXT: .LBB48_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB48_2 ; RV32ZVE32F-NEXT: .LBB48_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB48_3 ; RV32ZVE32F-NEXT: .LBB48_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB48_4 ; RV32ZVE32F-NEXT: .LBB48_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB48_5 ; RV32ZVE32F-NEXT: .LBB48_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB48_6 ; RV32ZVE32F-NEXT: .LBB48_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB48_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -4356,42 +4356,42 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB48_17 ; RV32ZVE32F-NEXT: .LBB48_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB48_18 ; RV32ZVE32F-NEXT: .LBB48_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw 
s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB48_15 ; RV32ZVE32F-NEXT: .LBB48_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB48_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -4546,77 +4546,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB49_8 ; RV32ZVE32F-NEXT: .LBB49_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB49_9 ; RV32ZVE32F-NEXT: .LBB49_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB49_10 ; RV32ZVE32F-NEXT: .LBB49_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB49_11 ; RV32ZVE32F-NEXT: .LBB49_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB49_12 ; RV32ZVE32F-NEXT: .LBB49_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB49_13 ; RV32ZVE32F-NEXT: .LBB49_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB49_2 ; RV32ZVE32F-NEXT: .LBB49_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, 
e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB49_3 ; RV32ZVE32F-NEXT: .LBB49_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB49_4 ; RV32ZVE32F-NEXT: .LBB49_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB49_5 ; RV32ZVE32F-NEXT: .LBB49_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB49_6 ; RV32ZVE32F-NEXT: .LBB49_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB49_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -4630,42 +4630,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB49_17 ; RV32ZVE32F-NEXT: .LBB49_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB49_18 ; RV32ZVE32F-NEXT: .LBB49_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB49_15 ; RV32ZVE32F-NEXT: .LBB49_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB49_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw 
s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -4822,77 +4822,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB50_8 ; RV32ZVE32F-NEXT: .LBB50_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB50_9 ; RV32ZVE32F-NEXT: .LBB50_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB50_10 ; RV32ZVE32F-NEXT: .LBB50_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB50_11 ; RV32ZVE32F-NEXT: .LBB50_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB50_12 ; RV32ZVE32F-NEXT: .LBB50_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB50_13 ; RV32ZVE32F-NEXT: .LBB50_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB50_2 ; RV32ZVE32F-NEXT: .LBB50_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB50_3 ; RV32ZVE32F-NEXT: .LBB50_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB50_4 ; RV32ZVE32F-NEXT: .LBB50_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; 
RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB50_5 ; RV32ZVE32F-NEXT: .LBB50_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB50_6 ; RV32ZVE32F-NEXT: .LBB50_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB50_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -4906,42 +4906,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB50_17 ; RV32ZVE32F-NEXT: .LBB50_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB50_18 ; RV32ZVE32F-NEXT: .LBB50_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB50_15 ; RV32ZVE32F-NEXT: .LBB50_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB50_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -5105,77 +5105,77 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; 
RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB51_8 ; RV32ZVE32F-NEXT: .LBB51_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB51_9 ; RV32ZVE32F-NEXT: .LBB51_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB51_10 ; RV32ZVE32F-NEXT: .LBB51_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB51_11 ; RV32ZVE32F-NEXT: .LBB51_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB51_12 ; RV32ZVE32F-NEXT: .LBB51_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB51_13 ; RV32ZVE32F-NEXT: .LBB51_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB51_2 ; RV32ZVE32F-NEXT: .LBB51_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB51_3 ; RV32ZVE32F-NEXT: .LBB51_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB51_4 ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB51_5 ; RV32ZVE32F-NEXT: .LBB51_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB51_6 ; RV32ZVE32F-NEXT: .LBB51_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: 
.LBB51_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -5189,42 +5189,42 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB51_17 ; RV32ZVE32F-NEXT: .LBB51_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB51_18 ; RV32ZVE32F-NEXT: .LBB51_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB51_15 ; RV32ZVE32F-NEXT: .LBB51_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB51_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -5380,77 +5380,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB52_8 ; RV32ZVE32F-NEXT: .LBB52_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB52_9 ; RV32ZVE32F-NEXT: .LBB52_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB52_10 ; RV32ZVE32F-NEXT: .LBB52_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 
24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB52_11 ; RV32ZVE32F-NEXT: .LBB52_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB52_12 ; RV32ZVE32F-NEXT: .LBB52_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB52_13 ; RV32ZVE32F-NEXT: .LBB52_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB52_2 ; RV32ZVE32F-NEXT: .LBB52_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB52_3 ; RV32ZVE32F-NEXT: .LBB52_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB52_4 ; RV32ZVE32F-NEXT: .LBB52_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB52_5 ; RV32ZVE32F-NEXT: .LBB52_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB52_6 ; RV32ZVE32F-NEXT: .LBB52_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB52_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -5464,42 +5464,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB52_17 ; RV32ZVE32F-NEXT: .LBB52_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB52_18 ; RV32ZVE32F-NEXT: .LBB52_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw 
s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB52_15 ; RV32ZVE32F-NEXT: .LBB52_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB52_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -5657,77 +5657,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB53_8 ; RV32ZVE32F-NEXT: .LBB53_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB53_9 ; RV32ZVE32F-NEXT: .LBB53_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB53_10 ; RV32ZVE32F-NEXT: .LBB53_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB53_11 ; RV32ZVE32F-NEXT: .LBB53_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB53_12 ; RV32ZVE32F-NEXT: .LBB53_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB53_13 ; RV32ZVE32F-NEXT: .LBB53_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB53_2 ; RV32ZVE32F-NEXT: .LBB53_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, 
e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB53_3 ; RV32ZVE32F-NEXT: .LBB53_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB53_4 ; RV32ZVE32F-NEXT: .LBB53_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB53_5 ; RV32ZVE32F-NEXT: .LBB53_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB53_6 ; RV32ZVE32F-NEXT: .LBB53_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB53_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -5741,42 +5741,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB53_17 ; RV32ZVE32F-NEXT: .LBB53_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB53_18 ; RV32ZVE32F-NEXT: .LBB53_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB53_15 ; RV32ZVE32F-NEXT: .LBB53_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB53_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw 
s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -5941,77 +5941,77 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB54_8 ; RV32ZVE32F-NEXT: .LBB54_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB54_9 ; RV32ZVE32F-NEXT: .LBB54_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB54_10 ; RV32ZVE32F-NEXT: .LBB54_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB54_11 ; RV32ZVE32F-NEXT: .LBB54_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB54_12 ; RV32ZVE32F-NEXT: .LBB54_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB54_13 ; RV32ZVE32F-NEXT: .LBB54_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB54_2 ; RV32ZVE32F-NEXT: .LBB54_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB54_3 ; RV32ZVE32F-NEXT: .LBB54_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB54_4 ; RV32ZVE32F-NEXT: .LBB54_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; 
RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB54_5 ; RV32ZVE32F-NEXT: .LBB54_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB54_6 ; RV32ZVE32F-NEXT: .LBB54_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB54_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -6025,42 +6025,42 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB54_17 ; RV32ZVE32F-NEXT: .LBB54_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB54_18 ; RV32ZVE32F-NEXT: .LBB54_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB54_15 ; RV32ZVE32F-NEXT: .LBB54_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB54_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -6214,77 +6214,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; 
RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB55_8 ; RV32ZVE32F-NEXT: .LBB55_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB55_9 ; RV32ZVE32F-NEXT: .LBB55_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB55_10 ; RV32ZVE32F-NEXT: .LBB55_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB55_11 ; RV32ZVE32F-NEXT: .LBB55_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB55_12 ; RV32ZVE32F-NEXT: .LBB55_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB55_13 ; RV32ZVE32F-NEXT: .LBB55_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB55_2 ; RV32ZVE32F-NEXT: .LBB55_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB55_3 ; RV32ZVE32F-NEXT: .LBB55_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB55_4 ; RV32ZVE32F-NEXT: .LBB55_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB55_5 ; RV32ZVE32F-NEXT: .LBB55_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB55_6 ; RV32ZVE32F-NEXT: .LBB55_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: 
.LBB55_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -6298,42 +6298,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB55_17 ; RV32ZVE32F-NEXT: .LBB55_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB55_18 ; RV32ZVE32F-NEXT: .LBB55_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB55_15 ; RV32ZVE32F-NEXT: .LBB55_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB55_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -6488,77 +6488,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB56_8 ; RV32ZVE32F-NEXT: .LBB56_2: -; RV32ZVE32F-NEXT: lw a4, 12(a2) -; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 12(a2) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB56_9 ; RV32ZVE32F-NEXT: .LBB56_3: -; RV32ZVE32F-NEXT: lw a6, 20(a2) -; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 16(a2) +; RV32ZVE32F-NEXT: lw a7, 20(a2) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB56_10 ; RV32ZVE32F-NEXT: .LBB56_4: -; RV32ZVE32F-NEXT: lw t1, 28(a2) -; RV32ZVE32F-NEXT: lw t2, 
24(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw t2, 28(a2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB56_11 ; RV32ZVE32F-NEXT: .LBB56_5: -; RV32ZVE32F-NEXT: lw t3, 36(a2) -; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: lw t3, 32(a2) +; RV32ZVE32F-NEXT: lw t4, 36(a2) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB56_12 ; RV32ZVE32F-NEXT: .LBB56_6: -; RV32ZVE32F-NEXT: lw t5, 44(a2) -; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: lw t5, 40(a2) +; RV32ZVE32F-NEXT: lw t6, 44(a2) ; RV32ZVE32F-NEXT: j .LBB56_13 ; RV32ZVE32F-NEXT: .LBB56_7: -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a3, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB56_2 ; RV32ZVE32F-NEXT: .LBB56_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB56_3 ; RV32ZVE32F-NEXT: .LBB56_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB56_4 ; RV32ZVE32F-NEXT: .LBB56_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB56_5 ; RV32ZVE32F-NEXT: .LBB56_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB56_6 ; RV32ZVE32F-NEXT: .LBB56_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB56_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -6572,42 +6572,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB56_17 ; RV32ZVE32F-NEXT: .LBB56_15: -; RV32ZVE32F-NEXT: lw t0, 60(a2) -; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) +; RV32ZVE32F-NEXT: lw a2, 60(a2) ; RV32ZVE32F-NEXT: j .LBB56_18 ; RV32ZVE32F-NEXT: .LBB56_16: -; RV32ZVE32F-NEXT: lw s0, 52(a2) -; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: lw s0, 48(a2) +; RV32ZVE32F-NEXT: lw 
s1, 52(a2) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB56_15 ; RV32ZVE32F-NEXT: .LBB56_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: .LBB56_18: # %else20 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a2, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a2, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -6760,22 +6760,22 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; ; RV32ZVE32F-LABEL: mgather_baseidx_v8i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a4, 56(a2) -; RV32ZVE32F-NEXT: lw a5, 48(a2) -; RV32ZVE32F-NEXT: lw a6, 40(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 24(a2) ; RV32ZVE32F-NEXT: lw a7, 32(a2) -; RV32ZVE32F-NEXT: lw t0, 24(a2) -; RV32ZVE32F-NEXT: lw t1, 16(a2) -; RV32ZVE32F-NEXT: lw t2, 8(a2) +; RV32ZVE32F-NEXT: lw t0, 40(a2) +; RV32ZVE32F-NEXT: lw t1, 48(a2) +; RV32ZVE32F-NEXT: lw t2, 56(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma @@ -6785,77 +6785,77 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: lw a1, 4(a2) -; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: lw a1, 0(a2) +; RV32ZVE32F-NEXT: lw a2, 4(a2) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB57_8 ; RV32ZVE32F-NEXT: 
.LBB57_2: -; RV32ZVE32F-NEXT: lw a4, 12(a3) -; RV32ZVE32F-NEXT: lw a5, 8(a3) +; RV32ZVE32F-NEXT: lw a4, 8(a3) +; RV32ZVE32F-NEXT: lw a5, 12(a3) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: bnez a6, .LBB57_9 ; RV32ZVE32F-NEXT: .LBB57_3: -; RV32ZVE32F-NEXT: lw a6, 20(a3) -; RV32ZVE32F-NEXT: lw a7, 16(a3) +; RV32ZVE32F-NEXT: lw a6, 16(a3) +; RV32ZVE32F-NEXT: lw a7, 20(a3) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB57_10 ; RV32ZVE32F-NEXT: .LBB57_4: -; RV32ZVE32F-NEXT: lw t1, 28(a3) -; RV32ZVE32F-NEXT: lw t2, 24(a3) +; RV32ZVE32F-NEXT: lw t1, 24(a3) +; RV32ZVE32F-NEXT: lw t2, 28(a3) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB57_11 ; RV32ZVE32F-NEXT: .LBB57_5: -; RV32ZVE32F-NEXT: lw t3, 36(a3) -; RV32ZVE32F-NEXT: lw t4, 32(a3) +; RV32ZVE32F-NEXT: lw t3, 32(a3) +; RV32ZVE32F-NEXT: lw t4, 36(a3) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB57_12 ; RV32ZVE32F-NEXT: .LBB57_6: -; RV32ZVE32F-NEXT: lw t5, 44(a3) -; RV32ZVE32F-NEXT: lw t6, 40(a3) +; RV32ZVE32F-NEXT: lw t5, 40(a3) +; RV32ZVE32F-NEXT: lw t6, 44(a3) ; RV32ZVE32F-NEXT: j .LBB57_13 ; RV32ZVE32F-NEXT: .LBB57_7: -; RV32ZVE32F-NEXT: lw a1, 4(a3) -; RV32ZVE32F-NEXT: lw a2, 0(a3) +; RV32ZVE32F-NEXT: lw a1, 0(a3) +; RV32ZVE32F-NEXT: lw a2, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB57_2 ; RV32ZVE32F-NEXT: .LBB57_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 -; RV32ZVE32F-NEXT: lw a4, 4(a5) -; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: lw a4, 0(a5) +; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB57_3 ; RV32ZVE32F-NEXT: .LBB57_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 4(a7) -; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: lw a6, 0(a7) +; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB57_4 ; RV32ZVE32F-NEXT: .LBB57_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 -; RV32ZVE32F-NEXT: lw t1, 4(t2) -; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: lw t1, 0(t2) +; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB57_5 ; RV32ZVE32F-NEXT: .LBB57_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 -; RV32ZVE32F-NEXT: lw t3, 4(t4) -; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: lw t3, 0(t4) +; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB57_6 ; RV32ZVE32F-NEXT: .LBB57_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s t6, v10 -; RV32ZVE32F-NEXT: lw t5, 4(t6) -; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: lw t5, 0(t6) +; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB57_13: # %else14 ; RV32ZVE32F-NEXT: addi sp, sp, -16 ; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 @@ -6869,42 +6869,42 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 -; 
RV32ZVE32F-NEXT: lw s0, 4(s1) -; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: lw s0, 0(s1) +; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: bnez t0, .LBB57_17 ; RV32ZVE32F-NEXT: .LBB57_15: -; RV32ZVE32F-NEXT: lw t0, 60(a3) -; RV32ZVE32F-NEXT: lw a3, 56(a3) +; RV32ZVE32F-NEXT: lw t0, 56(a3) +; RV32ZVE32F-NEXT: lw a3, 60(a3) ; RV32ZVE32F-NEXT: j .LBB57_18 ; RV32ZVE32F-NEXT: .LBB57_16: -; RV32ZVE32F-NEXT: lw s0, 52(a3) -; RV32ZVE32F-NEXT: lw s1, 48(a3) +; RV32ZVE32F-NEXT: lw s0, 48(a3) +; RV32ZVE32F-NEXT: lw s1, 52(a3) ; RV32ZVE32F-NEXT: andi t0, t0, -128 ; RV32ZVE32F-NEXT: beqz t0, .LBB57_15 ; RV32ZVE32F-NEXT: .LBB57_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw t0, 4(a3) -; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw t0, 0(a3) +; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: .LBB57_18: # %else20 -; RV32ZVE32F-NEXT: sw a2, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) -; RV32ZVE32F-NEXT: sw a5, 8(a0) -; RV32ZVE32F-NEXT: sw a4, 12(a0) -; RV32ZVE32F-NEXT: sw a7, 16(a0) -; RV32ZVE32F-NEXT: sw a6, 20(a0) -; RV32ZVE32F-NEXT: sw t2, 24(a0) -; RV32ZVE32F-NEXT: sw t1, 28(a0) -; RV32ZVE32F-NEXT: sw t4, 32(a0) -; RV32ZVE32F-NEXT: sw t3, 36(a0) -; RV32ZVE32F-NEXT: sw t6, 40(a0) -; RV32ZVE32F-NEXT: sw t5, 44(a0) -; RV32ZVE32F-NEXT: sw s1, 48(a0) -; RV32ZVE32F-NEXT: sw s0, 52(a0) -; RV32ZVE32F-NEXT: sw a3, 56(a0) -; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a6, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw t1, 24(a0) +; RV32ZVE32F-NEXT: sw t2, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t4, 36(a0) +; RV32ZVE32F-NEXT: sw t5, 40(a0) +; RV32ZVE32F-NEXT: sw t6, 44(a0) +; RV32ZVE32F-NEXT: sw s0, 48(a0) +; RV32ZVE32F-NEXT: sw s1, 52(a0) +; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a3, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: addi sp, sp, 16 @@ -11928,22 +11928,22 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1> ; ; RV32ZVE32F-LABEL: mgather_baseidx_v8f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a3, 56(a2) -; RV32ZVE32F-NEXT: lw a4, 48(a2) -; RV32ZVE32F-NEXT: lw a5, 40(a2) +; RV32ZVE32F-NEXT: lw a3, 8(a2) +; RV32ZVE32F-NEXT: lw a4, 16(a2) +; RV32ZVE32F-NEXT: lw a5, 24(a2) ; RV32ZVE32F-NEXT: lw a6, 32(a2) -; RV32ZVE32F-NEXT: lw a7, 24(a2) -; RV32ZVE32F-NEXT: lw t0, 16(a2) -; RV32ZVE32F-NEXT: lw t1, 8(a2) +; RV32ZVE32F-NEXT: lw a7, 40(a2) +; RV32ZVE32F-NEXT: lw t0, 48(a2) +; RV32ZVE32F-NEXT: lw t1, 56(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 ; RV32ZVE32F-NEXT: vsll.vi v8, 
v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 0125c0256162c..3cd2cc0e12696 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -275,9 +275,9 @@ define void @mscatter_v4i8(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi a5, a3, 1 @@ -335,9 +335,9 @@ define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x ptr> %ptrs) { ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmset.m v9 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -407,13 +407,13 @@ define void @mscatter_v8i8(<8 x i8> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v8i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 56(a0) -; RV64ZVE32F-NEXT: ld a2, 48(a0) -; RV64ZVE32F-NEXT: ld a4, 40(a0) -; RV64ZVE32F-NEXT: ld a5, 32(a0) -; RV64ZVE32F-NEXT: ld a6, 24(a0) -; RV64ZVE32F-NEXT: ld a7, 16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a1, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi t1, a3, 1 @@ -822,9 +822,9 @@ define void @mscatter_v4i16(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi a5, a3, 1 @@ -882,9 +882,9 @@ define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x ptr> %ptrs) { ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmset.m v9 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -954,13 +954,13 @@ define void @mscatter_v8i16(<8 x i16> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v8i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 56(a0) -; RV64ZVE32F-NEXT: ld a2, 48(a0) -; RV64ZVE32F-NEXT: ld a4, 40(a0) -; RV64ZVE32F-NEXT: ld a5, 32(a0) -; RV64ZVE32F-NEXT: ld a6, 24(a0) -; RV64ZVE32F-NEXT: ld a7, 16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a1, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, 
ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi t1, a3, 1 @@ -1727,9 +1727,9 @@ define void @mscatter_v4i32(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi a5, a3, 1 @@ -1787,9 +1787,9 @@ define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x ptr> %ptrs) { ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmset.m v9 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -1859,13 +1859,13 @@ define void @mscatter_v8i32(<8 x i32> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v8i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 56(a0) -; RV64ZVE32F-NEXT: ld a2, 48(a0) -; RV64ZVE32F-NEXT: ld a4, 40(a0) -; RV64ZVE32F-NEXT: ld a5, 32(a0) -; RV64ZVE32F-NEXT: ld a6, 24(a0) -; RV64ZVE32F-NEXT: ld a7, 16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a1, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi t1, a3, 1 @@ -2947,8 +2947,8 @@ define void @mscatter_v2i64(<2 x i64> %val, <2 x ptr> %ptrs, <2 x i1> %m) { ; ; RV32ZVE32F-LABEL: mscatter_v2i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a2, 12(a0) ; RV32ZVE32F-NEXT: lw a1, 8(a0) +; RV32ZVE32F-NEXT: lw a2, 12(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v0 ; RV32ZVE32F-NEXT: andi a4, a3, 1 @@ -3014,12 +3014,12 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; ; RV32ZVE32F-LABEL: mscatter_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a1, 28(a0) -; RV32ZVE32F-NEXT: lw a2, 24(a0) -; RV32ZVE32F-NEXT: lw a3, 20(a0) -; RV32ZVE32F-NEXT: lw a4, 16(a0) -; RV32ZVE32F-NEXT: lw a7, 12(a0) ; RV32ZVE32F-NEXT: lw a6, 8(a0) +; RV32ZVE32F-NEXT: lw a7, 12(a0) +; RV32ZVE32F-NEXT: lw a3, 16(a0) +; RV32ZVE32F-NEXT: lw a4, 20(a0) +; RV32ZVE32F-NEXT: lw a1, 24(a0) +; RV32ZVE32F-NEXT: lw a2, 28(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a5, v0 ; RV32ZVE32F-NEXT: andi t0, a5, 1 @@ -3056,38 +3056,38 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v9 -; RV32ZVE32F-NEXT: sw a4, 0(a0) -; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) ; RV32ZVE32F-NEXT: andi a5, a5, 8 ; RV32ZVE32F-NEXT: beqz a5, .LBB38_4 ; RV32ZVE32F-NEXT: .LBB38_8: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a2, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_v4i64: ; RV64ZVE32F: # %bb.0: -; 
RV64ZVE32F-NEXT: ld a2, 24(a1) +; RV64ZVE32F-NEXT: ld a6, 8(a1) ; RV64ZVE32F-NEXT: ld a4, 16(a1) -; RV64ZVE32F-NEXT: ld a7, 8(a1) -; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a5, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 24(a1) ; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: ld a5, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi t1, a6, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v0 +; RV64ZVE32F-NEXT: andi t1, a7, 1 ; RV64ZVE32F-NEXT: bnez t1, .LBB38_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: andi a0, a7, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_6 ; RV64ZVE32F-NEXT: .LBB38_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: andi a0, a7, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_7 ; RV64ZVE32F-NEXT: .LBB38_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a6, 8 +; RV64ZVE32F-NEXT: andi a0, a7, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_8 ; RV64ZVE32F-NEXT: .LBB38_4: # %else6 ; RV64ZVE32F-NEXT: ret @@ -3095,15 +3095,15 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: sd a0, 0(a1) -; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: andi a0, a7, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_2 ; RV64ZVE32F-NEXT: .LBB38_6: # %cond.store1 -; RV64ZVE32F-NEXT: sd t0, 0(a7) -; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: sd t0, 0(a6) +; RV64ZVE32F-NEXT: andi a0, a7, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_3 ; RV64ZVE32F-NEXT: .LBB38_7: # %cond.store3 ; RV64ZVE32F-NEXT: sd a5, 0(a4) -; RV64ZVE32F-NEXT: andi a0, a6, 8 +; RV64ZVE32F-NEXT: andi a0, a7, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_4 ; RV64ZVE32F-NEXT: .LBB38_8: # %cond.store5 ; RV64ZVE32F-NEXT: sd a3, 0(a2) @@ -3127,12 +3127,12 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) { ; ; RV32ZVE32F-LABEL: mscatter_truemask_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a1, 28(a0) -; RV32ZVE32F-NEXT: lw a2, 24(a0) -; RV32ZVE32F-NEXT: lw a3, 20(a0) -; RV32ZVE32F-NEXT: lw a4, 16(a0) -; RV32ZVE32F-NEXT: lw a7, 12(a0) ; RV32ZVE32F-NEXT: lw a6, 8(a0) +; RV32ZVE32F-NEXT: lw a7, 12(a0) +; RV32ZVE32F-NEXT: lw a3, 16(a0) +; RV32ZVE32F-NEXT: lw a4, 20(a0) +; RV32ZVE32F-NEXT: lw a1, 24(a0) +; RV32ZVE32F-NEXT: lw a2, 28(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmset.m v9 ; RV32ZVE32F-NEXT: vmv.x.s a5, v9 @@ -3169,26 +3169,26 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) { ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v9 -; RV32ZVE32F-NEXT: sw a4, 0(a0) -; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) ; RV32ZVE32F-NEXT: andi a5, a5, 8 ; RV32ZVE32F-NEXT: beqz a5, .LBB39_4 ; RV32ZVE32F-NEXT: .LBB39_8: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a2, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 24(a1) -; RV64ZVE32F-NEXT: ld a4, 16(a1) ; RV64ZVE32F-NEXT: ld a7, 8(a1) -; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a5, 16(a0) +; RV64ZVE32F-NEXT: ld a4, 16(a1) +; RV64ZVE32F-NEXT: ld a2, 24(a1) 
; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: ld a5, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmset.m v8 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -3260,51 +3260,51 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a1, 60(a0) -; RV32ZVE32F-NEXT: lw a2, 56(a0) -; RV32ZVE32F-NEXT: lw a3, 52(a0) -; RV32ZVE32F-NEXT: lw a4, 48(a0) -; RV32ZVE32F-NEXT: lw a5, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a3, 48(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a1, 56(a0) +; RV32ZVE32F-NEXT: lw a2, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a6, v0 -; RV32ZVE32F-NEXT: andi s1, a6, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v0 +; RV32ZVE32F-NEXT: andi s1, a5, 1 ; RV32ZVE32F-NEXT: bnez s1, .LBB41_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, a6, 2 +; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_11 ; RV32ZVE32F-NEXT: .LBB41_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, a6, 4 +; RV32ZVE32F-NEXT: andi a0, a5, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_12 ; RV32ZVE32F-NEXT: .LBB41_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, a6, 8 +; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_13 ; RV32ZVE32F-NEXT: .LBB41_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, a6, 16 +; RV32ZVE32F-NEXT: andi a0, a5, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_14 ; RV32ZVE32F-NEXT: .LBB41_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, a6, 32 +; RV32ZVE32F-NEXT: andi a0, a5, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_15 ; RV32ZVE32F-NEXT: .LBB41_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, a6, 64 +; RV32ZVE32F-NEXT: andi a0, a5, 64 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_16 ; RV32ZVE32F-NEXT: .LBB41_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a6, -128 +; RV32ZVE32F-NEXT: andi a0, a5, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_9 ; RV32ZVE32F-NEXT: .LBB41_8: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a2, 0(a0) -; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: .LBB41_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -3318,7 +3318,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) -; RV32ZVE32F-NEXT: andi a0, a6, 2 +; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_2 ; RV32ZVE32F-NEXT: .LBB41_11: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -3326,47 +3326,47 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, 
<8 x i1> %m) { ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw s0, 4(a0) ; RV32ZVE32F-NEXT: sw t6, 0(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 4 +; RV32ZVE32F-NEXT: andi a0, a5, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_3 ; RV32ZVE32F-NEXT: .LBB41_12: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 8 +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_4 ; RV32ZVE32F-NEXT: .LBB41_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 16 +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_5 ; RV32ZVE32F-NEXT: .LBB41_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 32 +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_6 ; RV32ZVE32F-NEXT: .LBB41_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, 64 +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB41_7 ; RV32ZVE32F-NEXT: .LBB41_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a4, 0(a0) -; RV32ZVE32F-NEXT: sw a3, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a6, -128 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_8 ; RV32ZVE32F-NEXT: j .LBB41_9 ; @@ -3380,47 +3380,47 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV64ZVE32F-NEXT: .cfi_offset s0, -8 ; RV64ZVE32F-NEXT: .cfi_offset s1, -16 ; RV64ZVE32F-NEXT: .cfi_offset s2, -24 +; RV64ZVE32F-NEXT: ld t5, 8(a1) +; RV64ZVE32F-NEXT: ld t3, 16(a1) +; RV64ZVE32F-NEXT: ld t1, 24(a1) +; RV64ZVE32F-NEXT: ld a6, 32(a1) +; RV64ZVE32F-NEXT: ld a4, 40(a1) +; RV64ZVE32F-NEXT: ld a3, 48(a1) ; RV64ZVE32F-NEXT: ld a2, 56(a1) -; RV64ZVE32F-NEXT: ld a4, 48(a1) -; RV64ZVE32F-NEXT: ld a6, 40(a1) -; RV64ZVE32F-NEXT: ld t1, 32(a1) -; RV64ZVE32F-NEXT: ld t3, 24(a1) -; RV64ZVE32F-NEXT: ld t5, 16(a1) -; RV64ZVE32F-NEXT: ld s0, 8(a1) -; RV64ZVE32F-NEXT: ld a3, 56(a0) -; RV64ZVE32F-NEXT: ld a5, 48(a0) -; RV64ZVE32F-NEXT: ld t0, 40(a0) -; RV64ZVE32F-NEXT: ld t2, 32(a0) -; RV64ZVE32F-NEXT: ld t4, 24(a0) -; RV64ZVE32F-NEXT: ld t6, 16(a0) ; RV64ZVE32F-NEXT: ld s1, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a7, v0 -; RV64ZVE32F-NEXT: andi s2, a7, 1 +; RV64ZVE32F-NEXT: ld s0, 16(a0) +; RV64ZVE32F-NEXT: ld t6, 24(a0) +; RV64ZVE32F-NEXT: ld t4, 32(a0) +; RV64ZVE32F-NEXT: ld t2, 40(a0) +; RV64ZVE32F-NEXT: ld a7, 48(a0) +; RV64ZVE32F-NEXT: ld 
a5, 56(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s t0, v0 +; RV64ZVE32F-NEXT: andi s2, t0, 1 ; RV64ZVE32F-NEXT: bnez s2, .LBB41_10 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a7, 2 +; RV64ZVE32F-NEXT: andi a0, t0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB41_11 ; RV64ZVE32F-NEXT: .LBB41_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a7, 4 +; RV64ZVE32F-NEXT: andi a0, t0, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB41_12 ; RV64ZVE32F-NEXT: .LBB41_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a7, 8 +; RV64ZVE32F-NEXT: andi a0, t0, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB41_13 ; RV64ZVE32F-NEXT: .LBB41_4: # %else6 -; RV64ZVE32F-NEXT: andi a0, a7, 16 +; RV64ZVE32F-NEXT: andi a0, t0, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB41_14 ; RV64ZVE32F-NEXT: .LBB41_5: # %else8 -; RV64ZVE32F-NEXT: andi a0, a7, 32 +; RV64ZVE32F-NEXT: andi a0, t0, 32 ; RV64ZVE32F-NEXT: bnez a0, .LBB41_15 ; RV64ZVE32F-NEXT: .LBB41_6: # %else10 -; RV64ZVE32F-NEXT: andi a0, a7, 64 +; RV64ZVE32F-NEXT: andi a0, t0, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB41_16 ; RV64ZVE32F-NEXT: .LBB41_7: # %else12 -; RV64ZVE32F-NEXT: andi a0, a7, -128 +; RV64ZVE32F-NEXT: andi a0, t0, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB41_9 ; RV64ZVE32F-NEXT: .LBB41_8: # %cond.store13 -; RV64ZVE32F-NEXT: sd a3, 0(a2) +; RV64ZVE32F-NEXT: sd a5, 0(a2) ; RV64ZVE32F-NEXT: .LBB41_9: # %else14 ; RV64ZVE32F-NEXT: ld s0, 24(sp) # 8-byte Folded Reload ; RV64ZVE32F-NEXT: ld s1, 16(sp) # 8-byte Folded Reload @@ -3431,31 +3431,31 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: sd a0, 0(a1) -; RV64ZVE32F-NEXT: andi a0, a7, 2 +; RV64ZVE32F-NEXT: andi a0, t0, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB41_2 ; RV64ZVE32F-NEXT: .LBB41_11: # %cond.store1 -; RV64ZVE32F-NEXT: sd s1, 0(s0) -; RV64ZVE32F-NEXT: andi a0, a7, 4 +; RV64ZVE32F-NEXT: sd s1, 0(t5) +; RV64ZVE32F-NEXT: andi a0, t0, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB41_3 ; RV64ZVE32F-NEXT: .LBB41_12: # %cond.store3 -; RV64ZVE32F-NEXT: sd t6, 0(t5) -; RV64ZVE32F-NEXT: andi a0, a7, 8 +; RV64ZVE32F-NEXT: sd s0, 0(t3) +; RV64ZVE32F-NEXT: andi a0, t0, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB41_4 ; RV64ZVE32F-NEXT: .LBB41_13: # %cond.store5 -; RV64ZVE32F-NEXT: sd t4, 0(t3) -; RV64ZVE32F-NEXT: andi a0, a7, 16 +; RV64ZVE32F-NEXT: sd t6, 0(t1) +; RV64ZVE32F-NEXT: andi a0, t0, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB41_5 ; RV64ZVE32F-NEXT: .LBB41_14: # %cond.store7 -; RV64ZVE32F-NEXT: sd t2, 0(t1) -; RV64ZVE32F-NEXT: andi a0, a7, 32 +; RV64ZVE32F-NEXT: sd t4, 0(a6) +; RV64ZVE32F-NEXT: andi a0, t0, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB41_6 ; RV64ZVE32F-NEXT: .LBB41_15: # %cond.store9 -; RV64ZVE32F-NEXT: sd t0, 0(a6) -; RV64ZVE32F-NEXT: andi a0, a7, 64 +; RV64ZVE32F-NEXT: sd t2, 0(a4) +; RV64ZVE32F-NEXT: andi a0, t0, 64 ; RV64ZVE32F-NEXT: beqz a0, .LBB41_7 ; RV64ZVE32F-NEXT: .LBB41_16: # %cond.store11 -; RV64ZVE32F-NEXT: sd a5, 0(a4) -; RV64ZVE32F-NEXT: andi a0, a7, -128 +; RV64ZVE32F-NEXT: sd a7, 0(a3) +; RV64ZVE32F-NEXT: andi a0, t0, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB41_8 ; RV64ZVE32F-NEXT: j .LBB41_9 call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %val, <8 x ptr> %ptrs, i32 8, <8 x i1> %m) @@ -3490,20 +3490,20 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; 
RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 @@ -3537,8 +3537,8 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB42_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -3566,53 +3566,53 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_4 ; RV32ZVE32F-NEXT: .LBB42_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_5 ; RV32ZVE32F-NEXT: .LBB42_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_6 ; RV32ZVE32F-NEXT: .LBB42_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB42_7 ; RV32ZVE32F-NEXT: .LBB42_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB42_8 ; RV32ZVE32F-NEXT: j .LBB42_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 
56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a5, 40(a0) -; RV64ZVE32F-NEXT: ld a6, 32(a0) -; RV64ZVE32F-NEXT: ld a7, 24(a0) -; RV64ZVE32F-NEXT: ld t0, 16(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi t2, a4, 1 @@ -3734,20 +3734,20 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 @@ -3781,8 +3781,8 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB43_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -3810,53 +3810,53 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_4 ; RV32ZVE32F-NEXT: .LBB43_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_5 ; RV32ZVE32F-NEXT: .LBB43_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_6 ; RV32ZVE32F-NEXT: .LBB43_15: # 
%cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB43_7 ; RV32ZVE32F-NEXT: .LBB43_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB43_8 ; RV32ZVE32F-NEXT: j .LBB43_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a5, 40(a0) -; RV64ZVE32F-NEXT: ld a6, 32(a0) -; RV64ZVE32F-NEXT: ld a7, 24(a0) -; RV64ZVE32F-NEXT: ld t0, 16(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi t2, a4, 1 @@ -3980,20 +3980,20 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf4 v10, v8 ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 @@ -4027,8 +4027,8 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB44_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -4056,53 +4056,53 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; 
RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_4 ; RV32ZVE32F-NEXT: .LBB44_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_5 ; RV32ZVE32F-NEXT: .LBB44_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_6 ; RV32ZVE32F-NEXT: .LBB44_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB44_7 ; RV32ZVE32F-NEXT: .LBB44_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB44_8 ; RV32ZVE32F-NEXT: j .LBB44_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a5, 40(a0) -; RV64ZVE32F-NEXT: ld a6, 32(a0) -; RV64ZVE32F-NEXT: ld a7, 24(a0) -; RV64ZVE32F-NEXT: ld t0, 16(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi t2, a4, 1 @@ -4233,20 +4233,20 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 @@ -4280,8 +4280,8 @@ 
define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB45_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -4309,53 +4309,53 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_4 ; RV32ZVE32F-NEXT: .LBB45_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_5 ; RV32ZVE32F-NEXT: .LBB45_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_6 ; RV32ZVE32F-NEXT: .LBB45_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_7 ; RV32ZVE32F-NEXT: .LBB45_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_8 ; RV32ZVE32F-NEXT: j .LBB45_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a5, 40(a0) -; RV64ZVE32F-NEXT: ld a6, 32(a0) -; RV64ZVE32F-NEXT: ld a7, 24(a0) -; RV64ZVE32F-NEXT: ld t0, 16(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi t2, a4, 1 @@ -4478,20 +4478,20 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; 
RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 @@ -4525,8 +4525,8 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB46_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -4554,53 +4554,53 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_4 ; RV32ZVE32F-NEXT: .LBB46_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_5 ; RV32ZVE32F-NEXT: .LBB46_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_6 ; RV32ZVE32F-NEXT: .LBB46_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_7 ; RV32ZVE32F-NEXT: .LBB46_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_8 ; RV32ZVE32F-NEXT: j .LBB46_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a5, 40(a0) -; RV64ZVE32F-NEXT: ld 
a6, 32(a0) -; RV64ZVE32F-NEXT: ld a7, 24(a0) -; RV64ZVE32F-NEXT: ld t0, 16(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi t2, a4, 1 @@ -4725,20 +4725,20 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 @@ -4772,8 +4772,8 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB47_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -4801,53 +4801,53 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_4 ; RV32ZVE32F-NEXT: .LBB47_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_5 ; RV32ZVE32F-NEXT: .LBB47_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_6 ; RV32ZVE32F-NEXT: .LBB47_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, 
v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_7 ; RV32ZVE32F-NEXT: .LBB47_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_8 ; RV32ZVE32F-NEXT: j .LBB47_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a6, 40(a0) -; RV64ZVE32F-NEXT: ld a7, 32(a0) -; RV64ZVE32F-NEXT: ld t0, 24(a0) -; RV64ZVE32F-NEXT: ld t1, 16(a0) ; RV64ZVE32F-NEXT: ld t2, 8(a0) +; RV64ZVE32F-NEXT: ld t1, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 24(a0) +; RV64ZVE32F-NEXT: ld a7, 32(a0) +; RV64ZVE32F-NEXT: ld a6, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: lui a4, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 @@ -4980,20 +4980,20 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 @@ -5026,8 +5026,8 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB48_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -5055,53 +5055,53 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_4 ; RV32ZVE32F-NEXT: .LBB48_13: # 
%cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_5 ; RV32ZVE32F-NEXT: .LBB48_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_6 ; RV32ZVE32F-NEXT: .LBB48_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_7 ; RV32ZVE32F-NEXT: .LBB48_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_8 ; RV32ZVE32F-NEXT: j .LBB48_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a5, 40(a0) -; RV64ZVE32F-NEXT: ld a6, 32(a0) -; RV64ZVE32F-NEXT: ld a7, 24(a0) -; RV64ZVE32F-NEXT: ld t0, 16(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi t2, a4, 1 @@ -5223,20 +5223,20 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 @@ -5269,8 +5269,8 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: 
vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB49_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -5298,53 +5298,53 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_4 ; RV32ZVE32F-NEXT: .LBB49_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_5 ; RV32ZVE32F-NEXT: .LBB49_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_6 ; RV32ZVE32F-NEXT: .LBB49_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_7 ; RV32ZVE32F-NEXT: .LBB49_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_8 ; RV32ZVE32F-NEXT: j .LBB49_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a5, 40(a0) -; RV64ZVE32F-NEXT: ld a6, 32(a0) -; RV64ZVE32F-NEXT: ld a7, 24(a0) -; RV64ZVE32F-NEXT: ld t0, 16(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi t2, a4, 1 @@ -5467,20 +5467,20 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 -; RV32ZVE32F-NEXT: lw a2, 60(a0) -; RV32ZVE32F-NEXT: lw a3, 56(a0) -; RV32ZVE32F-NEXT: lw a4, 52(a0) -; RV32ZVE32F-NEXT: lw a5, 48(a0) -; RV32ZVE32F-NEXT: lw a6, 44(a0) -; RV32ZVE32F-NEXT: lw a7, 40(a0) -; RV32ZVE32F-NEXT: lw t0, 36(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 
28(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 20(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) ; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw a6, 40(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 @@ -5513,8 +5513,8 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a3, 0(a0) -; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB50_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -5542,53 +5542,53 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_4 ; RV32ZVE32F-NEXT: .LBB50_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_5 ; RV32ZVE32F-NEXT: .LBB50_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_6 ; RV32ZVE32F-NEXT: .LBB50_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a7, 0(a0) -; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: sw a7, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_7 ; RV32ZVE32F-NEXT: .LBB50_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a5, 0(a0) -; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_8 ; RV32ZVE32F-NEXT: j .LBB50_9 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a2, 56(a0) -; RV64ZVE32F-NEXT: ld a3, 48(a0) -; RV64ZVE32F-NEXT: ld a5, 40(a0) -; RV64ZVE32F-NEXT: ld a6, 32(a0) -; RV64ZVE32F-NEXT: ld a7, 24(a0) -; RV64ZVE32F-NEXT: ld t0, 16(a0) ; 
RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 ; RV64ZVE32F-NEXT: andi t2, a4, 1 @@ -5731,36 +5731,36 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: .cfi_offset s6, -28 ; RV32ZVE32F-NEXT: .cfi_offset s7, -32 ; RV32ZVE32F-NEXT: .cfi_offset s8, -36 -; RV32ZVE32F-NEXT: lw a3, 60(a0) -; RV32ZVE32F-NEXT: lw a4, 56(a0) -; RV32ZVE32F-NEXT: lw a5, 52(a0) -; RV32ZVE32F-NEXT: lw a6, 48(a0) -; RV32ZVE32F-NEXT: lw a7, 44(a0) -; RV32ZVE32F-NEXT: lw t0, 40(a0) -; RV32ZVE32F-NEXT: lw t1, 36(a0) -; RV32ZVE32F-NEXT: lw t2, 32(a0) -; RV32ZVE32F-NEXT: lw t3, 28(a0) -; RV32ZVE32F-NEXT: lw t4, 24(a0) -; RV32ZVE32F-NEXT: lw t5, 20(a0) -; RV32ZVE32F-NEXT: lw t6, 16(a0) -; RV32ZVE32F-NEXT: lw s1, 12(a0) ; RV32ZVE32F-NEXT: lw s0, 8(a0) -; RV32ZVE32F-NEXT: lw s2, 56(a2) -; RV32ZVE32F-NEXT: lw s3, 48(a2) -; RV32ZVE32F-NEXT: lw s4, 40(a2) +; RV32ZVE32F-NEXT: lw s1, 12(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw t6, 20(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 28(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 36(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 44(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 52(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 60(a0) +; RV32ZVE32F-NEXT: lw s2, 8(a2) +; RV32ZVE32F-NEXT: lw s3, 16(a2) +; RV32ZVE32F-NEXT: lw s4, 24(a2) ; RV32ZVE32F-NEXT: lw s5, 32(a2) -; RV32ZVE32F-NEXT: lw s6, 24(a2) -; RV32ZVE32F-NEXT: lw s7, 16(a2) -; RV32ZVE32F-NEXT: lw s8, 8(a2) +; RV32ZVE32F-NEXT: lw s6, 40(a2) +; RV32ZVE32F-NEXT: lw s7, 48(a2) +; RV32ZVE32F-NEXT: lw s8, 56(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s6 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma @@ -5792,8 +5792,8 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw a4, 0(a0) -; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) ; RV32ZVE32F-NEXT: .LBB51_9: # %else14 ; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload @@ -5827,40 +5827,40 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t6, 0(a0) -; 
RV32ZVE32F-NEXT: sw t5, 4(a0) +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t6, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB51_4 ; RV32ZVE32F-NEXT: .LBB51_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t4, 0(a0) -; RV32ZVE32F-NEXT: sw t3, 4(a0) +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB51_5 ; RV32ZVE32F-NEXT: .LBB51_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t2, 0(a0) -; RV32ZVE32F-NEXT: sw t1, 4(a0) +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB51_6 ; RV32ZVE32F-NEXT: .LBB51_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t0, 0(a0) -; RV32ZVE32F-NEXT: sw a7, 4(a0) +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB51_7 ; RV32ZVE32F-NEXT: .LBB51_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw a6, 0(a0) -; RV32ZVE32F-NEXT: sw a5, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) ; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB51_8 ; RV32ZVE32F-NEXT: j .LBB51_9 @@ -5877,16 +5877,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV64ZVE32F-NEXT: .cfi_offset s1, -16 ; RV64ZVE32F-NEXT: .cfi_offset s2, -24 ; RV64ZVE32F-NEXT: .cfi_offset s3, -32 -; RV64ZVE32F-NEXT: ld a3, 56(a0) -; RV64ZVE32F-NEXT: ld a4, 48(a0) -; RV64ZVE32F-NEXT: ld a6, 40(a0) -; RV64ZVE32F-NEXT: ld t1, 32(a0) +; RV64ZVE32F-NEXT: ld s0, 8(a0) +; RV64ZVE32F-NEXT: ld t5, 16(a0) ; RV64ZVE32F-NEXT: ld t3, 24(a0) -; RV64ZVE32F-NEXT: ld t6, 16(a0) -; RV64ZVE32F-NEXT: ld s1, 8(a0) +; RV64ZVE32F-NEXT: ld t1, 32(a0) +; RV64ZVE32F-NEXT: ld a6, 40(a0) +; RV64ZVE32F-NEXT: ld a4, 48(a0) +; RV64ZVE32F-NEXT: ld a3, 56(a0) ; RV64ZVE32F-NEXT: ld s2, 8(a2) -; RV64ZVE32F-NEXT: ld s0, 16(a2) -; RV64ZVE32F-NEXT: ld t5, 24(a2) +; RV64ZVE32F-NEXT: ld s1, 16(a2) +; RV64ZVE32F-NEXT: ld t6, 24(a2) ; RV64ZVE32F-NEXT: ld t4, 32(a2) ; RV64ZVE32F-NEXT: ld t2, 40(a2) ; RV64ZVE32F-NEXT: ld t0, 48(a2) @@ -5938,19 +5938,19 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV64ZVE32F-NEXT: .LBB51_11: # %cond.store1 ; RV64ZVE32F-NEXT: slli s2, s2, 3 ; RV64ZVE32F-NEXT: add s2, a1, s2 -; RV64ZVE32F-NEXT: sd s1, 0(s2) +; RV64ZVE32F-NEXT: sd s0, 0(s2) ; RV64ZVE32F-NEXT: andi a0, a7, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB51_3 ; RV64ZVE32F-NEXT: .LBB51_12: # %cond.store3 -; RV64ZVE32F-NEXT: slli s0, s0, 3 -; RV64ZVE32F-NEXT: add s0, a1, s0 -; RV64ZVE32F-NEXT: sd t6, 0(s0) +; RV64ZVE32F-NEXT: slli s1, s1, 3 +; RV64ZVE32F-NEXT: add s1, a1, s1 +; RV64ZVE32F-NEXT: sd t5, 0(s1) ; RV64ZVE32F-NEXT: andi a0, a7, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB51_4 ; RV64ZVE32F-NEXT: .LBB51_13: # %cond.store5 -; RV64ZVE32F-NEXT: slli t5, t5, 3 -; RV64ZVE32F-NEXT: add t5, a1, t5 -; RV64ZVE32F-NEXT: sd t3, 0(t5) +; RV64ZVE32F-NEXT: slli t6, t6, 3 +; RV64ZVE32F-NEXT: add t6, a1, t6 +; 
RV64ZVE32F-NEXT: sd t3, 0(t6) ; RV64ZVE32F-NEXT: andi a0, a7, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB51_5 ; RV64ZVE32F-NEXT: .LBB51_14: # %cond.store7 @@ -6075,9 +6075,9 @@ define void @mscatter_v4f16(<4 x half> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi a5, a3, 1 @@ -6135,9 +6135,9 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) { ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmset.m v9 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -6207,13 +6207,13 @@ define void @mscatter_v8f16(<8 x half> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v8f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 56(a0) -; RV64ZVE32F-NEXT: ld a2, 48(a0) -; RV64ZVE32F-NEXT: ld a4, 40(a0) -; RV64ZVE32F-NEXT: ld a5, 32(a0) -; RV64ZVE32F-NEXT: ld a6, 24(a0) -; RV64ZVE32F-NEXT: ld a7, 16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a1, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi t1, a3, 1 @@ -6927,9 +6927,9 @@ define void @mscatter_v4f32(<4 x float> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi a5, a3, 1 @@ -6987,9 +6987,9 @@ define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x ptr> %ptrs) { ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmset.m v9 ; RV64ZVE32F-NEXT: vmv.x.s a3, v9 @@ -7059,13 +7059,13 @@ define void @mscatter_v8f32(<8 x float> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v8f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 56(a0) -; RV64ZVE32F-NEXT: ld a2, 48(a0) -; RV64ZVE32F-NEXT: ld a4, 40(a0) -; RV64ZVE32F-NEXT: ld a5, 32(a0) -; RV64ZVE32F-NEXT: ld a6, 24(a0) -; RV64ZVE32F-NEXT: ld a7, 16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a1, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi t1, a3, 1 @@ -8283,9 +8283,9 @@ define void @mscatter_v4f64(<4 x double> %val, <4 x ptr> %ptrs, <4 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v4f64: ; RV64ZVE32F: # 
%bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi a5, a3, 1 @@ -8380,9 +8380,9 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x ptr> %ptrs) { ; ; RV64ZVE32F-LABEL: mscatter_truemask_v4f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 24(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmset.m v8 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -8529,13 +8529,13 @@ define void @mscatter_v8f64(<8 x double> %val, <8 x ptr> %ptrs, <8 x i1> %m) { ; ; RV64ZVE32F-LABEL: mscatter_v8f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a1, 56(a0) -; RV64ZVE32F-NEXT: ld a2, 48(a0) -; RV64ZVE32F-NEXT: ld a4, 40(a0) -; RV64ZVE32F-NEXT: ld a5, 32(a0) -; RV64ZVE32F-NEXT: ld a6, 24(a0) -; RV64ZVE32F-NEXT: ld a7, 16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a1, 56(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 ; RV64ZVE32F-NEXT: andi t1, a3, 1 @@ -10452,22 +10452,22 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a2, 56(a1) -; RV32ZVE32F-NEXT: lw a3, 48(a1) -; RV32ZVE32F-NEXT: lw a4, 40(a1) +; RV32ZVE32F-NEXT: lw a2, 8(a1) +; RV32ZVE32F-NEXT: lw a3, 16(a1) +; RV32ZVE32F-NEXT: lw a4, 24(a1) ; RV32ZVE32F-NEXT: lw a5, 32(a1) -; RV32ZVE32F-NEXT: lw a6, 24(a1) -; RV32ZVE32F-NEXT: lw a7, 16(a1) -; RV32ZVE32F-NEXT: lw t0, 8(a1) +; RV32ZVE32F-NEXT: lw a6, 40(a1) +; RV32ZVE32F-NEXT: lw a7, 48(a1) +; RV32ZVE32F-NEXT: lw t0, 56(a1) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a1), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 2ff2529e259a8..412d759beb713 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -1322,21 +1322,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: flh fa4, 486(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 51(sp) -; ZVFHMIN32-NEXT: flh fa5, 228(sp) -; ZVFHMIN32-NEXT: flh fa4, 484(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: flh fa5, 
222(sp) +; ZVFHMIN32-NEXT: flh fa4, 224(sp) +; ZVFHMIN32-NEXT: flh fa3, 226(sp) +; ZVFHMIN32-NEXT: flh fa2, 228(sp) +; ZVFHMIN32-NEXT: flh fa1, 484(sp) +; ZVFHMIN32-NEXT: flh fa0, 482(sp) +; ZVFHMIN32-NEXT: flh ft0, 478(sp) +; ZVFHMIN32-NEXT: flh ft1, 480(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 50(sp) -; ZVFHMIN32-NEXT: flh fa5, 226(sp) -; ZVFHMIN32-NEXT: flh fa4, 482(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN32-NEXT: sb a0, 49(sp) -; ZVFHMIN32-NEXT: flh fa5, 224(sp) -; ZVFHMIN32-NEXT: flh fa4, 480(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN32-NEXT: sb a0, 48(sp) -; ZVFHMIN32-NEXT: flh fa5, 222(sp) -; ZVFHMIN32-NEXT: flh fa4, 478(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: sb a0, 47(sp) ; ZVFHMIN32-NEXT: flh fa5, 382(sp) ; ZVFHMIN32-NEXT: flh fa4, 638(sp) @@ -1390,21 +1390,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: flh fa4, 614(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 115(sp) -; ZVFHMIN32-NEXT: flh fa5, 356(sp) -; ZVFHMIN32-NEXT: flh fa4, 612(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: flh fa5, 350(sp) +; ZVFHMIN32-NEXT: flh fa4, 352(sp) +; ZVFHMIN32-NEXT: flh fa3, 354(sp) +; ZVFHMIN32-NEXT: flh fa2, 356(sp) +; ZVFHMIN32-NEXT: flh fa1, 612(sp) +; ZVFHMIN32-NEXT: flh fa0, 610(sp) +; ZVFHMIN32-NEXT: flh ft0, 606(sp) +; ZVFHMIN32-NEXT: flh ft1, 608(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 114(sp) -; ZVFHMIN32-NEXT: flh fa5, 354(sp) -; ZVFHMIN32-NEXT: flh fa4, 610(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN32-NEXT: sb a0, 113(sp) -; ZVFHMIN32-NEXT: flh fa5, 352(sp) -; ZVFHMIN32-NEXT: flh fa4, 608(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN32-NEXT: sb a0, 112(sp) -; ZVFHMIN32-NEXT: flh fa5, 350(sp) -; ZVFHMIN32-NEXT: flh fa4, 606(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: sb a0, 111(sp) ; ZVFHMIN32-NEXT: flh fa5, 220(sp) ; ZVFHMIN32-NEXT: flh fa4, 476(sp) @@ -1458,21 +1458,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: flh fa4, 452(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 34(sp) -; ZVFHMIN32-NEXT: flh fa5, 194(sp) -; ZVFHMIN32-NEXT: flh fa4, 450(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: flh fa5, 188(sp) +; ZVFHMIN32-NEXT: flh fa4, 190(sp) +; ZVFHMIN32-NEXT: flh fa3, 192(sp) +; ZVFHMIN32-NEXT: flh fa2, 194(sp) +; ZVFHMIN32-NEXT: flh fa1, 450(sp) +; ZVFHMIN32-NEXT: flh fa0, 448(sp) +; ZVFHMIN32-NEXT: flh ft0, 444(sp) +; ZVFHMIN32-NEXT: flh ft1, 446(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 33(sp) -; ZVFHMIN32-NEXT: flh fa5, 192(sp) -; ZVFHMIN32-NEXT: flh fa4, 448(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN32-NEXT: sb a0, 32(sp) -; ZVFHMIN32-NEXT: flh fa5, 190(sp) -; ZVFHMIN32-NEXT: flh fa4, 446(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN32-NEXT: sb a0, 31(sp) -; ZVFHMIN32-NEXT: flh fa5, 188(sp) -; ZVFHMIN32-NEXT: flh fa4, 444(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: sb a0, 30(sp) ; ZVFHMIN32-NEXT: flh fa5, 348(sp) ; ZVFHMIN32-NEXT: flh fa4, 604(sp) @@ -1526,21 +1526,21 @@ define <128 x 
i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: flh fa4, 580(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 98(sp) -; ZVFHMIN32-NEXT: flh fa5, 322(sp) -; ZVFHMIN32-NEXT: flh fa4, 578(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: flh fa5, 316(sp) +; ZVFHMIN32-NEXT: flh fa4, 318(sp) +; ZVFHMIN32-NEXT: flh fa3, 320(sp) +; ZVFHMIN32-NEXT: flh fa2, 322(sp) +; ZVFHMIN32-NEXT: flh fa1, 578(sp) +; ZVFHMIN32-NEXT: flh fa0, 576(sp) +; ZVFHMIN32-NEXT: flh ft0, 572(sp) +; ZVFHMIN32-NEXT: flh ft1, 574(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 97(sp) -; ZVFHMIN32-NEXT: flh fa5, 320(sp) -; ZVFHMIN32-NEXT: flh fa4, 576(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN32-NEXT: sb a0, 96(sp) -; ZVFHMIN32-NEXT: flh fa5, 318(sp) -; ZVFHMIN32-NEXT: flh fa4, 574(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN32-NEXT: sb a0, 95(sp) -; ZVFHMIN32-NEXT: flh fa5, 316(sp) -; ZVFHMIN32-NEXT: flh fa4, 572(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: sb a0, 94(sp) ; ZVFHMIN32-NEXT: flh fa5, 186(sp) ; ZVFHMIN32-NEXT: flh fa4, 442(sp) @@ -1594,21 +1594,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: flh fa4, 418(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 17(sp) -; ZVFHMIN32-NEXT: flh fa5, 160(sp) -; ZVFHMIN32-NEXT: flh fa4, 416(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: flh fa5, 154(sp) +; ZVFHMIN32-NEXT: flh fa4, 156(sp) +; ZVFHMIN32-NEXT: flh fa3, 158(sp) +; ZVFHMIN32-NEXT: flh fa2, 160(sp) +; ZVFHMIN32-NEXT: flh fa1, 416(sp) +; ZVFHMIN32-NEXT: flh fa0, 414(sp) +; ZVFHMIN32-NEXT: flh ft0, 410(sp) +; ZVFHMIN32-NEXT: flh ft1, 412(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 16(sp) -; ZVFHMIN32-NEXT: flh fa5, 158(sp) -; ZVFHMIN32-NEXT: flh fa4, 414(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN32-NEXT: sb a0, 15(sp) -; ZVFHMIN32-NEXT: flh fa5, 156(sp) -; ZVFHMIN32-NEXT: flh fa4, 412(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN32-NEXT: sb a0, 14(sp) -; ZVFHMIN32-NEXT: flh fa5, 154(sp) -; ZVFHMIN32-NEXT: flh fa4, 410(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: sb a0, 13(sp) ; ZVFHMIN32-NEXT: flh fa5, 314(sp) ; ZVFHMIN32-NEXT: flh fa4, 570(sp) @@ -1662,21 +1662,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: flh fa4, 546(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 81(sp) -; ZVFHMIN32-NEXT: flh fa5, 288(sp) -; ZVFHMIN32-NEXT: flh fa4, 544(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: flh fa5, 282(sp) +; ZVFHMIN32-NEXT: flh fa4, 284(sp) +; ZVFHMIN32-NEXT: flh fa3, 286(sp) +; ZVFHMIN32-NEXT: flh fa2, 288(sp) +; ZVFHMIN32-NEXT: flh fa1, 544(sp) +; ZVFHMIN32-NEXT: flh fa0, 542(sp) +; ZVFHMIN32-NEXT: flh ft0, 538(sp) +; ZVFHMIN32-NEXT: flh ft1, 540(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 80(sp) -; ZVFHMIN32-NEXT: flh fa5, 286(sp) -; ZVFHMIN32-NEXT: flh fa4, 542(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN32-NEXT: sb a0, 79(sp) -; ZVFHMIN32-NEXT: flh fa5, 284(sp) -; ZVFHMIN32-NEXT: flh fa4, 540(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa4, ft1 ; 
ZVFHMIN32-NEXT: sb a0, 78(sp) -; ZVFHMIN32-NEXT: flh fa5, 282(sp) -; ZVFHMIN32-NEXT: flh fa4, 538(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: sb a0, 77(sp) ; ZVFHMIN32-NEXT: flh fa5, 152(sp) ; ZVFHMIN32-NEXT: flh fa4, 408(sp) @@ -1714,21 +1714,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: flh fa4, 392(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 4(sp) -; ZVFHMIN32-NEXT: flh fa5, 134(sp) -; ZVFHMIN32-NEXT: flh fa4, 390(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: flh fa5, 128(sp) +; ZVFHMIN32-NEXT: flh fa4, 130(sp) +; ZVFHMIN32-NEXT: flh fa3, 132(sp) +; ZVFHMIN32-NEXT: flh fa2, 134(sp) +; ZVFHMIN32-NEXT: flh fa1, 390(sp) +; ZVFHMIN32-NEXT: flh fa0, 388(sp) +; ZVFHMIN32-NEXT: flh ft0, 384(sp) +; ZVFHMIN32-NEXT: flh ft1, 386(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 3(sp) -; ZVFHMIN32-NEXT: flh fa5, 132(sp) -; ZVFHMIN32-NEXT: flh fa4, 388(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN32-NEXT: sb a0, 2(sp) -; ZVFHMIN32-NEXT: flh fa5, 130(sp) -; ZVFHMIN32-NEXT: flh fa4, 386(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN32-NEXT: sb a0, 1(sp) -; ZVFHMIN32-NEXT: flh fa5, 128(sp) -; ZVFHMIN32-NEXT: flh fa4, 384(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: sb a0, 0(sp) ; ZVFHMIN32-NEXT: flh fa5, 280(sp) ; ZVFHMIN32-NEXT: flh fa4, 536(sp) @@ -1766,21 +1766,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: flh fa4, 520(sp) ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 68(sp) -; ZVFHMIN32-NEXT: flh fa5, 262(sp) -; ZVFHMIN32-NEXT: flh fa4, 518(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: flh fa5, 256(sp) +; ZVFHMIN32-NEXT: flh fa4, 258(sp) +; ZVFHMIN32-NEXT: flh fa3, 260(sp) +; ZVFHMIN32-NEXT: flh fa2, 262(sp) +; ZVFHMIN32-NEXT: flh fa1, 518(sp) +; ZVFHMIN32-NEXT: flh fa0, 516(sp) +; ZVFHMIN32-NEXT: flh ft0, 512(sp) +; ZVFHMIN32-NEXT: flh ft1, 514(sp) +; ZVFHMIN32-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN32-NEXT: sb a0, 67(sp) -; ZVFHMIN32-NEXT: flh fa5, 260(sp) -; ZVFHMIN32-NEXT: flh fa4, 516(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN32-NEXT: sb a0, 66(sp) -; ZVFHMIN32-NEXT: flh fa5, 258(sp) -; ZVFHMIN32-NEXT: flh fa4, 514(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN32-NEXT: sb a0, 65(sp) -; ZVFHMIN32-NEXT: flh fa5, 256(sp) -; ZVFHMIN32-NEXT: flh fa4, 512(sp) -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN32-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN32-NEXT: sb a0, 64(sp) ; ZVFHMIN32-NEXT: li a0, 128 ; ZVFHMIN32-NEXT: mv a1, sp @@ -1870,21 +1870,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: flh fa4, 486(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 51(sp) -; ZVFHMIN64-NEXT: flh fa5, 228(sp) -; ZVFHMIN64-NEXT: flh fa4, 484(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: flh fa5, 222(sp) +; ZVFHMIN64-NEXT: flh fa4, 224(sp) +; ZVFHMIN64-NEXT: flh fa3, 226(sp) +; ZVFHMIN64-NEXT: flh fa2, 228(sp) +; ZVFHMIN64-NEXT: flh fa1, 484(sp) +; ZVFHMIN64-NEXT: flh fa0, 482(sp) +; ZVFHMIN64-NEXT: flh ft0, 478(sp) +; ZVFHMIN64-NEXT: flh ft1, 480(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 50(sp) -; ZVFHMIN64-NEXT: flh fa5, 226(sp) -; 
ZVFHMIN64-NEXT: flh fa4, 482(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN64-NEXT: sb a0, 49(sp) -; ZVFHMIN64-NEXT: flh fa5, 224(sp) -; ZVFHMIN64-NEXT: flh fa4, 480(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN64-NEXT: sb a0, 48(sp) -; ZVFHMIN64-NEXT: flh fa5, 222(sp) -; ZVFHMIN64-NEXT: flh fa4, 478(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: sb a0, 47(sp) ; ZVFHMIN64-NEXT: flh fa5, 382(sp) ; ZVFHMIN64-NEXT: flh fa4, 638(sp) @@ -1938,21 +1938,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: flh fa4, 614(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 115(sp) -; ZVFHMIN64-NEXT: flh fa5, 356(sp) -; ZVFHMIN64-NEXT: flh fa4, 612(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: flh fa5, 350(sp) +; ZVFHMIN64-NEXT: flh fa4, 352(sp) +; ZVFHMIN64-NEXT: flh fa3, 354(sp) +; ZVFHMIN64-NEXT: flh fa2, 356(sp) +; ZVFHMIN64-NEXT: flh fa1, 612(sp) +; ZVFHMIN64-NEXT: flh fa0, 610(sp) +; ZVFHMIN64-NEXT: flh ft0, 606(sp) +; ZVFHMIN64-NEXT: flh ft1, 608(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 114(sp) -; ZVFHMIN64-NEXT: flh fa5, 354(sp) -; ZVFHMIN64-NEXT: flh fa4, 610(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN64-NEXT: sb a0, 113(sp) -; ZVFHMIN64-NEXT: flh fa5, 352(sp) -; ZVFHMIN64-NEXT: flh fa4, 608(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN64-NEXT: sb a0, 112(sp) -; ZVFHMIN64-NEXT: flh fa5, 350(sp) -; ZVFHMIN64-NEXT: flh fa4, 606(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: sb a0, 111(sp) ; ZVFHMIN64-NEXT: flh fa5, 220(sp) ; ZVFHMIN64-NEXT: flh fa4, 476(sp) @@ -2006,21 +2006,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: flh fa4, 452(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 34(sp) -; ZVFHMIN64-NEXT: flh fa5, 194(sp) -; ZVFHMIN64-NEXT: flh fa4, 450(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: flh fa5, 188(sp) +; ZVFHMIN64-NEXT: flh fa4, 190(sp) +; ZVFHMIN64-NEXT: flh fa3, 192(sp) +; ZVFHMIN64-NEXT: flh fa2, 194(sp) +; ZVFHMIN64-NEXT: flh fa1, 450(sp) +; ZVFHMIN64-NEXT: flh fa0, 448(sp) +; ZVFHMIN64-NEXT: flh ft0, 444(sp) +; ZVFHMIN64-NEXT: flh ft1, 446(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 33(sp) -; ZVFHMIN64-NEXT: flh fa5, 192(sp) -; ZVFHMIN64-NEXT: flh fa4, 448(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN64-NEXT: sb a0, 32(sp) -; ZVFHMIN64-NEXT: flh fa5, 190(sp) -; ZVFHMIN64-NEXT: flh fa4, 446(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN64-NEXT: sb a0, 31(sp) -; ZVFHMIN64-NEXT: flh fa5, 188(sp) -; ZVFHMIN64-NEXT: flh fa4, 444(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: sb a0, 30(sp) ; ZVFHMIN64-NEXT: flh fa5, 348(sp) ; ZVFHMIN64-NEXT: flh fa4, 604(sp) @@ -2074,21 +2074,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: flh fa4, 580(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 98(sp) -; ZVFHMIN64-NEXT: flh fa5, 322(sp) -; ZVFHMIN64-NEXT: flh fa4, 578(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: flh fa5, 316(sp) +; ZVFHMIN64-NEXT: flh fa4, 318(sp) +; 
ZVFHMIN64-NEXT: flh fa3, 320(sp) +; ZVFHMIN64-NEXT: flh fa2, 322(sp) +; ZVFHMIN64-NEXT: flh fa1, 578(sp) +; ZVFHMIN64-NEXT: flh fa0, 576(sp) +; ZVFHMIN64-NEXT: flh ft0, 572(sp) +; ZVFHMIN64-NEXT: flh ft1, 574(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 97(sp) -; ZVFHMIN64-NEXT: flh fa5, 320(sp) -; ZVFHMIN64-NEXT: flh fa4, 576(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN64-NEXT: sb a0, 96(sp) -; ZVFHMIN64-NEXT: flh fa5, 318(sp) -; ZVFHMIN64-NEXT: flh fa4, 574(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN64-NEXT: sb a0, 95(sp) -; ZVFHMIN64-NEXT: flh fa5, 316(sp) -; ZVFHMIN64-NEXT: flh fa4, 572(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: sb a0, 94(sp) ; ZVFHMIN64-NEXT: flh fa5, 186(sp) ; ZVFHMIN64-NEXT: flh fa4, 442(sp) @@ -2142,21 +2142,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: flh fa4, 418(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 17(sp) -; ZVFHMIN64-NEXT: flh fa5, 160(sp) -; ZVFHMIN64-NEXT: flh fa4, 416(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: flh fa5, 154(sp) +; ZVFHMIN64-NEXT: flh fa4, 156(sp) +; ZVFHMIN64-NEXT: flh fa3, 158(sp) +; ZVFHMIN64-NEXT: flh fa2, 160(sp) +; ZVFHMIN64-NEXT: flh fa1, 416(sp) +; ZVFHMIN64-NEXT: flh fa0, 414(sp) +; ZVFHMIN64-NEXT: flh ft0, 410(sp) +; ZVFHMIN64-NEXT: flh ft1, 412(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 16(sp) -; ZVFHMIN64-NEXT: flh fa5, 158(sp) -; ZVFHMIN64-NEXT: flh fa4, 414(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN64-NEXT: sb a0, 15(sp) -; ZVFHMIN64-NEXT: flh fa5, 156(sp) -; ZVFHMIN64-NEXT: flh fa4, 412(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN64-NEXT: sb a0, 14(sp) -; ZVFHMIN64-NEXT: flh fa5, 154(sp) -; ZVFHMIN64-NEXT: flh fa4, 410(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: sb a0, 13(sp) ; ZVFHMIN64-NEXT: flh fa5, 314(sp) ; ZVFHMIN64-NEXT: flh fa4, 570(sp) @@ -2210,21 +2210,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: flh fa4, 546(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 81(sp) -; ZVFHMIN64-NEXT: flh fa5, 288(sp) -; ZVFHMIN64-NEXT: flh fa4, 544(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: flh fa5, 282(sp) +; ZVFHMIN64-NEXT: flh fa4, 284(sp) +; ZVFHMIN64-NEXT: flh fa3, 286(sp) +; ZVFHMIN64-NEXT: flh fa2, 288(sp) +; ZVFHMIN64-NEXT: flh fa1, 544(sp) +; ZVFHMIN64-NEXT: flh fa0, 542(sp) +; ZVFHMIN64-NEXT: flh ft0, 538(sp) +; ZVFHMIN64-NEXT: flh ft1, 540(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 80(sp) -; ZVFHMIN64-NEXT: flh fa5, 286(sp) -; ZVFHMIN64-NEXT: flh fa4, 542(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN64-NEXT: sb a0, 79(sp) -; ZVFHMIN64-NEXT: flh fa5, 284(sp) -; ZVFHMIN64-NEXT: flh fa4, 540(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN64-NEXT: sb a0, 78(sp) -; ZVFHMIN64-NEXT: flh fa5, 282(sp) -; ZVFHMIN64-NEXT: flh fa4, 538(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: sb a0, 77(sp) ; ZVFHMIN64-NEXT: flh fa5, 152(sp) ; ZVFHMIN64-NEXT: flh fa4, 408(sp) @@ -2262,21 +2262,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x 
half> %vb, <128 ; ZVFHMIN64-NEXT: flh fa4, 392(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 4(sp) -; ZVFHMIN64-NEXT: flh fa5, 134(sp) -; ZVFHMIN64-NEXT: flh fa4, 390(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: flh fa5, 128(sp) +; ZVFHMIN64-NEXT: flh fa4, 130(sp) +; ZVFHMIN64-NEXT: flh fa3, 132(sp) +; ZVFHMIN64-NEXT: flh fa2, 134(sp) +; ZVFHMIN64-NEXT: flh fa1, 390(sp) +; ZVFHMIN64-NEXT: flh fa0, 388(sp) +; ZVFHMIN64-NEXT: flh ft0, 384(sp) +; ZVFHMIN64-NEXT: flh ft1, 386(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 3(sp) -; ZVFHMIN64-NEXT: flh fa5, 132(sp) -; ZVFHMIN64-NEXT: flh fa4, 388(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN64-NEXT: sb a0, 2(sp) -; ZVFHMIN64-NEXT: flh fa5, 130(sp) -; ZVFHMIN64-NEXT: flh fa4, 386(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN64-NEXT: sb a0, 1(sp) -; ZVFHMIN64-NEXT: flh fa5, 128(sp) -; ZVFHMIN64-NEXT: flh fa4, 384(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: sb a0, 0(sp) ; ZVFHMIN64-NEXT: flh fa5, 280(sp) ; ZVFHMIN64-NEXT: flh fa4, 536(sp) @@ -2314,21 +2314,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: flh fa4, 520(sp) ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 68(sp) -; ZVFHMIN64-NEXT: flh fa5, 262(sp) -; ZVFHMIN64-NEXT: flh fa4, 518(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: flh fa5, 256(sp) +; ZVFHMIN64-NEXT: flh fa4, 258(sp) +; ZVFHMIN64-NEXT: flh fa3, 260(sp) +; ZVFHMIN64-NEXT: flh fa2, 262(sp) +; ZVFHMIN64-NEXT: flh fa1, 518(sp) +; ZVFHMIN64-NEXT: flh fa0, 516(sp) +; ZVFHMIN64-NEXT: flh ft0, 512(sp) +; ZVFHMIN64-NEXT: flh ft1, 514(sp) +; ZVFHMIN64-NEXT: feq.h a0, fa2, fa1 ; ZVFHMIN64-NEXT: sb a0, 67(sp) -; ZVFHMIN64-NEXT: flh fa5, 260(sp) -; ZVFHMIN64-NEXT: flh fa4, 516(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa3, fa0 ; ZVFHMIN64-NEXT: sb a0, 66(sp) -; ZVFHMIN64-NEXT: flh fa5, 258(sp) -; ZVFHMIN64-NEXT: flh fa4, 514(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa4, ft1 ; ZVFHMIN64-NEXT: sb a0, 65(sp) -; ZVFHMIN64-NEXT: flh fa5, 256(sp) -; ZVFHMIN64-NEXT: flh fa4, 512(sp) -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 +; ZVFHMIN64-NEXT: feq.h a0, fa5, ft0 ; ZVFHMIN64-NEXT: sb a0, 64(sp) ; ZVFHMIN64-NEXT: li a0, 128 ; ZVFHMIN64-NEXT: mv a1, sp diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index eeb188627577d..66331eb4e4010 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -743,18 +743,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu ; ZVE32F-NEXT: li a5, 40 ; ZVE32F-NEXT: .LBB12_1: # %bb2 ; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 -; ZVE32F-NEXT: ld a6, 8(a1) -; ZVE32F-NEXT: ld a7, 0(a1) -; ZVE32F-NEXT: ld t0, 24(a1) -; ZVE32F-NEXT: ld t1, 16(a1) +; ZVE32F-NEXT: ld a6, 0(a1) +; ZVE32F-NEXT: ld a7, 8(a1) +; ZVE32F-NEXT: ld t0, 16(a1) +; ZVE32F-NEXT: ld t1, 24(a1) ; ZVE32F-NEXT: mul t2, a4, a5 ; ZVE32F-NEXT: add t2, a0, t2 ; ZVE32F-NEXT: mul t3, a2, a5 ; ZVE32F-NEXT: add t3, a0, t3 -; ZVE32F-NEXT: sd a7, 0(t3) -; ZVE32F-NEXT: sd a6, 0(t2) -; ZVE32F-NEXT: sd t1, 80(t3) -; ZVE32F-NEXT: sd t0, 80(t2) +; ZVE32F-NEXT: sd a6, 0(t3) +; ZVE32F-NEXT: sd a7, 0(t2) +; 
ZVE32F-NEXT: sd t0, 80(t3) +; ZVE32F-NEXT: sd t1, 80(t2) ; ZVE32F-NEXT: addi a2, a2, 4 ; ZVE32F-NEXT: addi a1, a1, 32 ; ZVE32F-NEXT: addi a4, a4, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index 7497051027fa3..afcfc4889c68b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -364,21 +364,21 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs0, -48 ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 -; CHECK-NOV-NEXT: lhu s1, 24(a1) -; CHECK-NOV-NEXT: lhu s2, 0(a1) -; CHECK-NOV-NEXT: lhu s3, 8(a1) -; CHECK-NOV-NEXT: lhu a1, 16(a1) +; CHECK-NOV-NEXT: lhu s1, 0(a1) +; CHECK-NOV-NEXT: lhu s2, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz @@ -459,18 +459,18 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s2, 24(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma @@ -499,7 +499,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -557,17 +557,17 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu s2, 24(a1) -; CHECK-NOV-NEXT: lhu s3, 16(a1) -; CHECK-NOV-NEXT: lhu a1, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: lhu s2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call 
__extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 @@ -631,18 +631,18 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s2, 24(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma @@ -671,7 +671,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -726,17 +726,17 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu s2, 24(a1) -; CHECK-NOV-NEXT: lhu s3, 16(a1) -; CHECK-NOV-NEXT: lhu a1, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: lhu s2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 @@ -812,18 +812,18 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s2, 24(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma @@ -852,7 +852,7 @@ define <4 x i32> @ustest_f16i32(<4 x 
half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -1267,37 +1267,37 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs4, -112 ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 -; CHECK-NOV-NEXT: lhu s1, 56(a1) -; CHECK-NOV-NEXT: lhu s2, 0(a1) -; CHECK-NOV-NEXT: lhu s3, 8(a1) -; CHECK-NOV-NEXT: lhu s4, 16(a1) -; CHECK-NOV-NEXT: lhu s5, 24(a1) -; CHECK-NOV-NEXT: lhu s6, 32(a1) -; CHECK-NOV-NEXT: lhu s7, 40(a1) -; CHECK-NOV-NEXT: lhu a1, 48(a1) +; CHECK-NOV-NEXT: lhu s1, 0(a1) +; CHECK-NOV-NEXT: lhu s2, 8(a1) +; CHECK-NOV-NEXT: lhu s3, 16(a1) +; CHECK-NOV-NEXT: lhu s4, 24(a1) +; CHECK-NOV-NEXT: lhu s5, 32(a1) +; CHECK-NOV-NEXT: lhu s6, 40(a1) +; CHECK-NOV-NEXT: lhu a2, 48(a1) +; CHECK-NOV-NEXT: lhu s7, 56(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs6, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s7 +; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs5, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s6 +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs4, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 +; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs3, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s4 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 +; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz @@ -1448,22 +1448,22 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 1 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) +; CHECK-V-NEXT: lhu s4, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s6, 24(a0) +; CHECK-V-NEXT: lhu s3, 32(a0) +; CHECK-V-NEXT: lhu s2, 40(a0) +; CHECK-V-NEXT: lhu s1, 48(a0) +; CHECK-V-NEXT: lhu s0, 56(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma @@ 
-1481,7 +1481,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma @@ -1593,33 +1593,33 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu s2, 56(a1) -; CHECK-NOV-NEXT: lhu s3, 48(a1) -; CHECK-NOV-NEXT: lhu s4, 40(a1) -; CHECK-NOV-NEXT: lhu s5, 32(a1) -; CHECK-NOV-NEXT: lhu s6, 24(a1) -; CHECK-NOV-NEXT: lhu s7, 16(a1) -; CHECK-NOV-NEXT: lhu a1, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: lhu s2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) +; CHECK-NOV-NEXT: lhu s4, 32(a1) +; CHECK-NOV-NEXT: lhu s5, 40(a1) +; CHECK-NOV-NEXT: lhu s6, 48(a1) +; CHECK-NOV-NEXT: lhu s7, 56(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs6, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s7 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs5, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s6 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs4, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 +; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs3, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s4 +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 @@ -1731,22 +1731,22 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 1 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) +; CHECK-V-NEXT: lhu s4, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s6, 24(a0) +; CHECK-V-NEXT: lhu s3, 32(a0) +; CHECK-V-NEXT: lhu s2, 40(a0) +; CHECK-V-NEXT: lhu s1, 48(a0) +; CHECK-V-NEXT: lhu s0, 56(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma @@ -1764,7 +1764,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; 
CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma @@ -1872,33 +1872,33 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu s2, 56(a1) -; CHECK-NOV-NEXT: lhu s3, 48(a1) -; CHECK-NOV-NEXT: lhu s4, 40(a1) -; CHECK-NOV-NEXT: lhu s5, 32(a1) -; CHECK-NOV-NEXT: lhu s6, 24(a1) -; CHECK-NOV-NEXT: lhu s7, 16(a1) -; CHECK-NOV-NEXT: lhu a1, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: lhu s2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) +; CHECK-NOV-NEXT: lhu s4, 32(a1) +; CHECK-NOV-NEXT: lhu s5, 40(a1) +; CHECK-NOV-NEXT: lhu s6, 48(a1) +; CHECK-NOV-NEXT: lhu s7, 56(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs6, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s7 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs5, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s6 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs4, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 +; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs3, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s4 +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 @@ -2034,22 +2034,22 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 1 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) +; CHECK-V-NEXT: lhu s4, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s6, 24(a0) +; CHECK-V-NEXT: lhu s3, 32(a0) +; CHECK-V-NEXT: lhu s2, 40(a0) +; CHECK-V-NEXT: lhu s1, 48(a0) +; CHECK-V-NEXT: lhu s0, 56(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma @@ -2067,7 +2067,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s4 +; 
CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma @@ -3700,21 +3700,21 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs0, -48 ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 -; CHECK-NOV-NEXT: lhu s1, 24(a1) -; CHECK-NOV-NEXT: lhu s2, 0(a1) -; CHECK-NOV-NEXT: lhu s3, 8(a1) -; CHECK-NOV-NEXT: lhu a1, 16(a1) +; CHECK-NOV-NEXT: lhu s1, 0(a1) +; CHECK-NOV-NEXT: lhu s2, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz @@ -3795,18 +3795,18 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s2, 24(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma @@ -3835,7 +3835,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -3891,17 +3891,17 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu s2, 24(a1) -; CHECK-NOV-NEXT: lhu s3, 16(a1) -; CHECK-NOV-NEXT: lhu a1, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: lhu s2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 @@ 
-3965,18 +3965,18 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s2, 24(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma @@ -4005,7 +4005,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -4058,21 +4058,21 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs0, -48 ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 -; CHECK-NOV-NEXT: lhu s1, 24(a1) -; CHECK-NOV-NEXT: lhu s2, 0(a1) -; CHECK-NOV-NEXT: lhu s3, 8(a1) -; CHECK-NOV-NEXT: lhu a1, 16(a1) +; CHECK-NOV-NEXT: lhu s1, 0(a1) +; CHECK-NOV-NEXT: lhu s2, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz @@ -4145,18 +4145,18 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s2, 24(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma @@ -4185,7 +4185,7 @@ define <4 x i32> 
@ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -4588,37 +4588,37 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs4, -112 ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 -; CHECK-NOV-NEXT: lhu s1, 56(a1) -; CHECK-NOV-NEXT: lhu s2, 0(a1) -; CHECK-NOV-NEXT: lhu s3, 8(a1) -; CHECK-NOV-NEXT: lhu s4, 16(a1) -; CHECK-NOV-NEXT: lhu s5, 24(a1) -; CHECK-NOV-NEXT: lhu s6, 32(a1) -; CHECK-NOV-NEXT: lhu s7, 40(a1) -; CHECK-NOV-NEXT: lhu a1, 48(a1) +; CHECK-NOV-NEXT: lhu s1, 0(a1) +; CHECK-NOV-NEXT: lhu s2, 8(a1) +; CHECK-NOV-NEXT: lhu s3, 16(a1) +; CHECK-NOV-NEXT: lhu s4, 24(a1) +; CHECK-NOV-NEXT: lhu s5, 32(a1) +; CHECK-NOV-NEXT: lhu s6, 40(a1) +; CHECK-NOV-NEXT: lhu a2, 48(a1) +; CHECK-NOV-NEXT: lhu s7, 56(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs6, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s7 +; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs5, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s6 +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs4, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 +; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs3, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s4 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 +; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz @@ -4769,22 +4769,22 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 1 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) +; CHECK-V-NEXT: lhu s4, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s6, 24(a0) +; CHECK-V-NEXT: lhu s3, 32(a0) +; CHECK-V-NEXT: lhu s2, 40(a0) +; CHECK-V-NEXT: lhu s1, 48(a0) +; CHECK-V-NEXT: lhu s0, 56(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: 
vsetivli zero, 2, e32, m2, tu, ma @@ -4802,7 +4802,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma @@ -4912,33 +4912,33 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu s2, 56(a1) -; CHECK-NOV-NEXT: lhu s3, 48(a1) -; CHECK-NOV-NEXT: lhu s4, 40(a1) -; CHECK-NOV-NEXT: lhu s5, 32(a1) -; CHECK-NOV-NEXT: lhu s6, 24(a1) -; CHECK-NOV-NEXT: lhu s7, 16(a1) -; CHECK-NOV-NEXT: lhu a1, 8(a1) +; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: lhu s2, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) +; CHECK-NOV-NEXT: lhu s4, 32(a1) +; CHECK-NOV-NEXT: lhu s5, 40(a1) +; CHECK-NOV-NEXT: lhu s6, 48(a1) +; CHECK-NOV-NEXT: lhu s7, 56(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs5, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s7 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs6, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s6 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs4, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 +; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs3, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s4 +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 ; CHECK-NOV-NEXT: fcvt.lu.s s2, fs6, rtz @@ -5048,22 +5048,22 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 1 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) +; CHECK-V-NEXT: lhu s4, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s6, 24(a0) +; CHECK-V-NEXT: lhu s3, 32(a0) +; CHECK-V-NEXT: lhu s2, 40(a0) +; CHECK-V-NEXT: lhu s1, 48(a0) +; CHECK-V-NEXT: lhu s0, 56(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma @@ -5081,7 +5081,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; 
CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma @@ -5187,37 +5187,37 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs4, -112 ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 -; CHECK-NOV-NEXT: lhu s1, 56(a1) -; CHECK-NOV-NEXT: lhu s2, 0(a1) -; CHECK-NOV-NEXT: lhu s3, 8(a1) -; CHECK-NOV-NEXT: lhu s4, 16(a1) -; CHECK-NOV-NEXT: lhu s5, 24(a1) -; CHECK-NOV-NEXT: lhu s6, 32(a1) -; CHECK-NOV-NEXT: lhu s7, 40(a1) -; CHECK-NOV-NEXT: lhu a1, 48(a1) +; CHECK-NOV-NEXT: lhu s1, 0(a1) +; CHECK-NOV-NEXT: lhu s2, 8(a1) +; CHECK-NOV-NEXT: lhu s3, 16(a1) +; CHECK-NOV-NEXT: lhu s4, 24(a1) +; CHECK-NOV-NEXT: lhu s5, 32(a1) +; CHECK-NOV-NEXT: lhu s6, 40(a1) +; CHECK-NOV-NEXT: lhu a2, 48(a1) +; CHECK-NOV-NEXT: lhu s7, 56(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a1 +; CHECK-NOV-NEXT: fmv.w.x fa0, a2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs6, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s7 +; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs5, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s6 +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs4, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 +; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs3, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s4 +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs1, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 +; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2@plt ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz @@ -5350,22 +5350,22 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 1 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) +; CHECK-V-NEXT: lhu s4, 0(a0) +; CHECK-V-NEXT: lhu a1, 8(a0) ; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) -; CHECK-V-NEXT: fmv.w.x fa0, a0 +; CHECK-V-NEXT: lhu s6, 24(a0) +; CHECK-V-NEXT: lhu s3, 32(a0) +; CHECK-V-NEXT: lhu s2, 40(a0) +; CHECK-V-NEXT: lhu s1, 48(a0) +; CHECK-V-NEXT: lhu s0, 56(a0) +; CHECK-V-NEXT: fmv.w.x fa0, a1 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: 
vsetivli zero, 2, e32, m2, tu, ma @@ -5383,7 +5383,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload ; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2@plt ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll index 485f94ee2a102..c3586095a12f6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll +++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll @@ -162,16 +162,16 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-LABEL: unaligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 3(a1) -; RV32-FAST-NEXT: sw a2, 3(a0) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy7: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: lw a2, 3(a1) -; RV64-FAST-NEXT: sw a2, 3(a0) ; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) ; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -253,9 +253,9 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-LABEL: unaligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: lw a3, 8(a1) ; RV32-FAST-NEXT: sw a2, 11(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) ; RV32-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-FAST-NEXT: vle32.v v8, (a1) ; RV32-FAST-NEXT: vse32.v v8, (a0) @@ -264,8 +264,8 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV64-FAST-LABEL: unaligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 7(a1) -; RV64-FAST-NEXT: sd a2, 7(a0) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -531,12 +531,12 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: addi a2, a0, 128 ; RV32-NEXT: vse8.v v8, (a2) ; RV32-NEXT: lbu a2, 195(a1) -; RV32-NEXT: sb a2, 195(a0) -; RV32-NEXT: lbu a2, 194(a1) -; RV32-NEXT: sb a2, 194(a0) -; RV32-NEXT: lbu a2, 193(a1) -; RV32-NEXT: sb a2, 193(a0) +; RV32-NEXT: lbu a3, 194(a1) +; RV32-NEXT: lbu a4, 193(a1) ; RV32-NEXT: lbu a1, 192(a1) +; RV32-NEXT: sb a2, 195(a0) +; RV32-NEXT: sb a3, 194(a0) +; RV32-NEXT: sb a4, 193(a0) ; RV32-NEXT: sb a1, 192(a0) ; RV32-NEXT: ret ; @@ -553,12 +553,12 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: addi a2, a0, 128 ; RV64-NEXT: vse8.v v8, (a2) ; RV64-NEXT: lbu a2, 195(a1) -; RV64-NEXT: sb a2, 195(a0) -; RV64-NEXT: lbu a2, 194(a1) -; RV64-NEXT: sb a2, 194(a0) -; RV64-NEXT: lbu a2, 193(a1) -; RV64-NEXT: sb a2, 193(a0) +; RV64-NEXT: lbu a3, 194(a1) +; RV64-NEXT: lbu a4, 193(a1) ; RV64-NEXT: lbu a1, 192(a1) +; RV64-NEXT: sb a2, 195(a0) +; RV64-NEXT: sb a3, 194(a0) +; RV64-NEXT: sb a4, 193(a0) ; RV64-NEXT: sb a1, 192(a0) ; RV64-NEXT: ret ; @@ -728,16 +728,16 @@ define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-LABEL: aligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 3(a1) -; RV32-FAST-NEXT: sw a2, 3(a0) ; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) ; RV32-FAST-NEXT: 
sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: aligned_memcpy7: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: lw a2, 3(a1) -; RV64-FAST-NEXT: sw a2, 3(a0) ; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) ; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: @@ -792,9 +792,9 @@ define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-LABEL: aligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry ; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: lw a3, 8(a1) ; RV32-FAST-NEXT: sw a2, 11(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) ; RV32-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-FAST-NEXT: vle32.v v8, (a1) ; RV32-FAST-NEXT: vse32.v v8, (a0) @@ -803,8 +803,8 @@ define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV64-FAST-LABEL: aligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 7(a1) -; RV64-FAST-NEXT: sd a2, 7(a0) ; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll index 65dca0daed8c7..e0c58bc323085 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll @@ -9,20 +9,20 @@ define <4 x float> @foo(ptr %0) nounwind { ; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: lhu s0, 6(a0) +; CHECK-NEXT: lhu s0, 0(a0) +; CHECK-NEXT: lhu a1, 2(a0) ; CHECK-NEXT: lhu s1, 4(a0) -; CHECK-NEXT: lhu s2, 0(a0) -; CHECK-NEXT: lhu a0, 2(a0) -; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: lhu s2, 6(a0) +; CHECK-NEXT: fmv.w.x fa0, a1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fsw fa0, 8(sp) -; CHECK-NEXT: fmv.w.x fa0, s2 +; CHECK-NEXT: fmv.w.x fa0, s0 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fsw fa0, 0(sp) ; CHECK-NEXT: fmv.w.x fa0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fsw fa0, 12(sp) -; CHECK-NEXT: fmv.w.x fa0, s0 +; CHECK-NEXT: fmv.w.x fa0, s2 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fsw fa0, 4(sp) ; CHECK-NEXT: addi a0, sp, 8 diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 97121c275a294..2e41622291e0d 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -151,12 +151,19 @@ define i64 @shl64_minsize(i64 %a, i64 %b) minsize nounwind { define i128 @lshr128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: lshr128: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a2, 0(a2) ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sb zero, 35(sp) +; RV32I-NEXT: sb zero, 34(sp) +; RV32I-NEXT: sb zero, 33(sp) +; RV32I-NEXT: sb zero, 32(sp) ; RV32I-NEXT: sb zero, 31(sp) ; RV32I-NEXT: sb zero, 30(sp) ; RV32I-NEXT: sb zero, 29(sp) @@ -169,94 +176,90 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sb zero, 22(sp) ; RV32I-NEXT: sb zero, 21(sp) ; RV32I-NEXT: sb zero, 20(sp) -; RV32I-NEXT: sb zero, 19(sp) -; RV32I-NEXT: sb zero, 18(sp) -; RV32I-NEXT: sb zero, 17(sp) -; RV32I-NEXT: sb zero, 16(sp) -; RV32I-NEXT: sb a1, 12(sp) -; RV32I-NEXT: sb a5, 8(sp) -; 
RV32I-NEXT: sb a4, 4(sp) -; RV32I-NEXT: sb a3, 0(sp) +; RV32I-NEXT: sb a1, 16(sp) +; RV32I-NEXT: sb a5, 12(sp) +; RV32I-NEXT: sb a4, 8(sp) +; RV32I-NEXT: sb a3, 4(sp) ; RV32I-NEXT: srli a6, a1, 24 -; RV32I-NEXT: sb a6, 15(sp) +; RV32I-NEXT: sb a6, 19(sp) ; RV32I-NEXT: srli a6, a1, 16 -; RV32I-NEXT: sb a6, 14(sp) +; RV32I-NEXT: sb a6, 18(sp) ; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 13(sp) +; RV32I-NEXT: sb a1, 17(sp) ; RV32I-NEXT: srli a1, a5, 24 -; RV32I-NEXT: sb a1, 11(sp) +; RV32I-NEXT: sb a1, 15(sp) ; RV32I-NEXT: srli a1, a5, 16 -; RV32I-NEXT: sb a1, 10(sp) +; RV32I-NEXT: sb a1, 14(sp) ; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 9(sp) +; RV32I-NEXT: sb a5, 13(sp) ; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: sb a1, 7(sp) +; RV32I-NEXT: sb a1, 11(sp) ; RV32I-NEXT: srli a1, a4, 16 -; RV32I-NEXT: sb a1, 6(sp) +; RV32I-NEXT: sb a1, 10(sp) ; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 5(sp) +; RV32I-NEXT: sb a4, 9(sp) ; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: sb a1, 3(sp) +; RV32I-NEXT: sb a1, 7(sp) ; RV32I-NEXT: srli a1, a3, 16 -; RV32I-NEXT: sb a1, 2(sp) +; RV32I-NEXT: sb a1, 6(sp) ; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 1(sp) +; RV32I-NEXT: sb a3, 5(sp) ; RV32I-NEXT: slli a1, a2, 25 ; RV32I-NEXT: srli a1, a1, 28 -; RV32I-NEXT: mv a3, sp +; RV32I-NEXT: addi a3, sp, 4 ; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: lbu a3, 1(a1) -; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 2(a1) ; RV32I-NEXT: lbu a6, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a7, 4(a1) +; RV32I-NEXT: lbu t0, 5(a1) +; RV32I-NEXT: lbu t1, 6(a1) +; RV32I-NEXT: lbu t2, 7(a1) +; RV32I-NEXT: lbu t3, 8(a1) +; RV32I-NEXT: lbu t4, 9(a1) +; RV32I-NEXT: lbu t5, 10(a1) +; RV32I-NEXT: lbu t6, 11(a1) +; RV32I-NEXT: lbu s0, 12(a1) +; RV32I-NEXT: lbu s1, 13(a1) +; RV32I-NEXT: lbu s2, 14(a1) +; RV32I-NEXT: lbu a1, 15(a1) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 ; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a4, t0, a7 ; RV32I-NEXT: andi a2, a2, 7 ; RV32I-NEXT: srl a3, a3, a2 -; RV32I-NEXT: lbu a4, 5(a1) -; RV32I-NEXT: lbu a5, 4(a1) -; RV32I-NEXT: lbu a6, 6(a1) -; RV32I-NEXT: lbu a7, 7(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a5, t2, t1 ; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a5, a4, 1 ; RV32I-NEXT: xori a6, a2, 31 ; RV32I-NEXT: sll a5, a5, a6 ; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: srl a4, a4, a2 -; RV32I-NEXT: lbu a5, 9(a1) -; RV32I-NEXT: lbu a7, 8(a1) -; RV32I-NEXT: lbu t0, 10(a1) -; RV32I-NEXT: lbu t1, 11(a1) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or a5, t4, t3 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or a7, t6, t5 ; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: slli a7, a5, 1 ; RV32I-NEXT: not t0, a2 -; RV32I-NEXT: lbu t1, 13(a1) ; RV32I-NEXT: sll a7, a7, t0 +; RV32I-NEXT: srl a4, a4, a2 ; RV32I-NEXT: or a4, a4, a7 -; RV32I-NEXT: lbu a7, 12(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: lbu t0, 14(a1) -; RV32I-NEXT: lbu a1, 15(a1) -; RV32I-NEXT: or a7, t1, a7 +; RV32I-NEXT: slli s1, s1, 
8 +; RV32I-NEXT: or s0, s1, s0 ; RV32I-NEXT: srl a5, a5, a2 -; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli s2, s2, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a1, a1, s2 +; RV32I-NEXT: or a1, a1, s0 ; RV32I-NEXT: slli a7, a1, 1 ; RV32I-NEXT: sll a6, a7, a6 ; RV32I-NEXT: or a5, a5, a6 @@ -265,7 +268,10 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sw a5, 8(a0) ; RV32I-NEXT: sw a4, 4(a0) ; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 48 ; RV32I-NEXT: ret ; ; RV64I-LABEL: lshr128: @@ -293,125 +299,131 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { define i128 @ashr128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: ashr128: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: lw a3, 12(a1) -; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: lw a5, 4(a1) -; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: lw a3, 8(a1) +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: lw a5, 0(a1) +; RV32I-NEXT: lw a6, 4(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: sb a4, 16(sp) ; RV32I-NEXT: sb a3, 12(sp) -; RV32I-NEXT: sb a4, 8(sp) +; RV32I-NEXT: sb a6, 8(sp) ; RV32I-NEXT: sb a5, 4(sp) -; RV32I-NEXT: sb a1, 0(sp) -; RV32I-NEXT: srai a6, a3, 31 -; RV32I-NEXT: sb a6, 28(sp) -; RV32I-NEXT: sb a6, 24(sp) -; RV32I-NEXT: sb a6, 20(sp) -; RV32I-NEXT: sb a6, 16(sp) -; RV32I-NEXT: srli a7, a3, 24 -; RV32I-NEXT: sb a7, 15(sp) -; RV32I-NEXT: srli a7, a3, 16 -; RV32I-NEXT: sb a7, 14(sp) +; RV32I-NEXT: srai a2, a4, 31 +; RV32I-NEXT: sb a2, 32(sp) +; RV32I-NEXT: sb a2, 28(sp) +; RV32I-NEXT: sb a2, 24(sp) +; RV32I-NEXT: sb a2, 20(sp) +; RV32I-NEXT: srli a7, a4, 24 +; RV32I-NEXT: sb a7, 19(sp) +; RV32I-NEXT: srli a7, a4, 16 +; RV32I-NEXT: sb a7, 18(sp) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 17(sp) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(sp) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(sp) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(sp) -; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: srli a3, a6, 24 ; RV32I-NEXT: sb a3, 11(sp) -; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: srli a3, a6, 16 ; RV32I-NEXT: sb a3, 10(sp) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 9(sp) +; RV32I-NEXT: srli a3, a6, 8 +; RV32I-NEXT: sb a3, 9(sp) ; RV32I-NEXT: srli a3, a5, 24 ; RV32I-NEXT: sb a3, 7(sp) ; RV32I-NEXT: srli a3, a5, 16 ; RV32I-NEXT: sb a3, 6(sp) ; RV32I-NEXT: srli a5, a5, 8 ; RV32I-NEXT: sb a5, 5(sp) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: sb a3, 3(sp) -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: sb a3, 2(sp) -; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 1(sp) -; RV32I-NEXT: srli a1, a6, 24 -; RV32I-NEXT: sb a1, 31(sp) -; RV32I-NEXT: srli a3, a6, 16 -; RV32I-NEXT: sb a3, 30(sp) -; RV32I-NEXT: srli a4, a6, 8 -; RV32I-NEXT: sb a4, 29(sp) -; RV32I-NEXT: sb a1, 27(sp) -; RV32I-NEXT: sb a3, 26(sp) -; RV32I-NEXT: sb a4, 25(sp) -; RV32I-NEXT: sb a1, 23(sp) -; RV32I-NEXT: sb a3, 22(sp) -; RV32I-NEXT: sb a4, 21(sp) -; RV32I-NEXT: sb a1, 19(sp) -; RV32I-NEXT: sb a3, 18(sp) -; RV32I-NEXT: sb a4, 17(sp) -; RV32I-NEXT: slli a1, a2, 25 -; RV32I-NEXT: srli a1, a1, 28 -; RV32I-NEXT: mv a3, sp -; RV32I-NEXT: 
add a1, a3, a1 -; RV32I-NEXT: lbu a3, 1(a1) -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a6, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srli a3, a2, 24 +; RV32I-NEXT: sb a3, 35(sp) +; RV32I-NEXT: srli a4, a2, 16 +; RV32I-NEXT: sb a4, 34(sp) +; RV32I-NEXT: srli a2, a2, 8 +; RV32I-NEXT: sb a2, 33(sp) +; RV32I-NEXT: sb a3, 31(sp) +; RV32I-NEXT: sb a4, 30(sp) +; RV32I-NEXT: sb a2, 29(sp) +; RV32I-NEXT: sb a3, 27(sp) +; RV32I-NEXT: sb a4, 26(sp) +; RV32I-NEXT: sb a2, 25(sp) +; RV32I-NEXT: sb a3, 23(sp) +; RV32I-NEXT: sb a4, 22(sp) +; RV32I-NEXT: sb a2, 21(sp) +; RV32I-NEXT: slli a2, a1, 25 +; RV32I-NEXT: srli a2, a2, 28 +; RV32I-NEXT: addi a3, sp, 4 +; RV32I-NEXT: add a2, a3, a2 +; RV32I-NEXT: lbu a3, 0(a2) +; RV32I-NEXT: lbu a4, 1(a2) +; RV32I-NEXT: lbu a5, 2(a2) +; RV32I-NEXT: lbu a6, 3(a2) +; RV32I-NEXT: lbu a7, 4(a2) +; RV32I-NEXT: lbu t0, 5(a2) +; RV32I-NEXT: lbu t1, 6(a2) +; RV32I-NEXT: lbu t2, 7(a2) +; RV32I-NEXT: lbu t3, 8(a2) +; RV32I-NEXT: lbu t4, 9(a2) +; RV32I-NEXT: lbu t5, 10(a2) +; RV32I-NEXT: lbu t6, 11(a2) +; RV32I-NEXT: lbu s0, 12(a2) +; RV32I-NEXT: lbu s1, 13(a2) +; RV32I-NEXT: lbu s2, 14(a2) +; RV32I-NEXT: lbu a2, 15(a2) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: andi a2, a2, 7 -; RV32I-NEXT: srl a3, a3, a2 -; RV32I-NEXT: lbu a4, 5(a1) -; RV32I-NEXT: lbu a5, 4(a1) -; RV32I-NEXT: lbu a6, 6(a1) -; RV32I-NEXT: lbu a7, 7(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a4, t0, a7 +; RV32I-NEXT: andi a1, a1, 7 +; RV32I-NEXT: srl a3, a3, a1 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a5, t2, t1 ; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a5, a4, 1 -; RV32I-NEXT: xori a6, a2, 31 +; RV32I-NEXT: xori a6, a1, 31 ; RV32I-NEXT: sll a5, a5, a6 ; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: srl a4, a4, a2 -; RV32I-NEXT: lbu a5, 9(a1) -; RV32I-NEXT: lbu a7, 8(a1) -; RV32I-NEXT: lbu t0, 10(a1) -; RV32I-NEXT: lbu t1, 11(a1) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or a5, t4, t3 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or a7, t6, t5 ; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: slli a7, a5, 1 -; RV32I-NEXT: not t0, a2 -; RV32I-NEXT: lbu t1, 13(a1) +; RV32I-NEXT: not t0, a1 ; RV32I-NEXT: sll a7, a7, t0 +; RV32I-NEXT: srl a4, a4, a1 ; RV32I-NEXT: or a4, a4, a7 -; RV32I-NEXT: lbu a7, 12(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: lbu t0, 14(a1) -; RV32I-NEXT: lbu a1, 15(a1) -; RV32I-NEXT: or a7, t1, a7 -; RV32I-NEXT: srl a5, a5, a2 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: slli a7, a1, 1 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli a2, a2, 24 +; RV32I-NEXT: or a2, a2, s2 +; RV32I-NEXT: or a2, a2, s0 +; RV32I-NEXT: slli a7, a2, 1 ; RV32I-NEXT: sll a6, a7, a6 ; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: sra a1, a1, a2 +; RV32I-NEXT: sra a1, a2, a1 ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: sw a5, 8(a0) ; RV32I-NEXT: sw a4, 4(a0) ; 
RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 48 ; RV32I-NEXT: ret ; ; RV64I-LABEL: ashr128: @@ -439,12 +451,19 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind { define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: shl128: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a2, 0(a2) ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sb zero, 19(sp) +; RV32I-NEXT: sb zero, 18(sp) +; RV32I-NEXT: sb zero, 17(sp) +; RV32I-NEXT: sb zero, 16(sp) ; RV32I-NEXT: sb zero, 15(sp) ; RV32I-NEXT: sb zero, 14(sp) ; RV32I-NEXT: sb zero, 13(sp) @@ -457,103 +476,102 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sb zero, 6(sp) ; RV32I-NEXT: sb zero, 5(sp) ; RV32I-NEXT: sb zero, 4(sp) -; RV32I-NEXT: sb zero, 3(sp) -; RV32I-NEXT: sb zero, 2(sp) -; RV32I-NEXT: sb zero, 1(sp) -; RV32I-NEXT: sb zero, 0(sp) -; RV32I-NEXT: sb a1, 28(sp) -; RV32I-NEXT: sb a5, 24(sp) -; RV32I-NEXT: sb a4, 20(sp) -; RV32I-NEXT: sb a3, 16(sp) +; RV32I-NEXT: sb a1, 32(sp) +; RV32I-NEXT: sb a5, 28(sp) +; RV32I-NEXT: sb a4, 24(sp) +; RV32I-NEXT: sb a3, 20(sp) ; RV32I-NEXT: srli a6, a1, 24 -; RV32I-NEXT: sb a6, 31(sp) +; RV32I-NEXT: sb a6, 35(sp) ; RV32I-NEXT: srli a6, a1, 16 -; RV32I-NEXT: sb a6, 30(sp) +; RV32I-NEXT: sb a6, 34(sp) ; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 29(sp) +; RV32I-NEXT: sb a1, 33(sp) ; RV32I-NEXT: srli a1, a5, 24 -; RV32I-NEXT: sb a1, 27(sp) +; RV32I-NEXT: sb a1, 31(sp) ; RV32I-NEXT: srli a1, a5, 16 -; RV32I-NEXT: sb a1, 26(sp) +; RV32I-NEXT: sb a1, 30(sp) ; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 25(sp) +; RV32I-NEXT: sb a5, 29(sp) ; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: sb a1, 23(sp) +; RV32I-NEXT: sb a1, 27(sp) ; RV32I-NEXT: srli a1, a4, 16 -; RV32I-NEXT: sb a1, 22(sp) +; RV32I-NEXT: sb a1, 26(sp) ; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 21(sp) +; RV32I-NEXT: sb a4, 25(sp) ; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: sb a1, 19(sp) +; RV32I-NEXT: sb a1, 23(sp) ; RV32I-NEXT: srli a1, a3, 16 -; RV32I-NEXT: sb a1, 18(sp) +; RV32I-NEXT: sb a1, 22(sp) ; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 17(sp) +; RV32I-NEXT: sb a3, 21(sp) ; RV32I-NEXT: slli a1, a2, 25 ; RV32I-NEXT: srli a1, a1, 28 -; RV32I-NEXT: addi a3, sp, 16 -; RV32I-NEXT: sub a1, a3, a1 -; RV32I-NEXT: lbu a3, 5(a1) -; RV32I-NEXT: lbu a4, 4(a1) -; RV32I-NEXT: lbu a5, 6(a1) -; RV32I-NEXT: lbu a6, 7(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: andi a2, a2, 7 -; RV32I-NEXT: sll a4, a3, a2 -; RV32I-NEXT: lbu a5, 1(a1) -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 2(a1) -; RV32I-NEXT: lbu t0, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: srli a6, a5, 1 -; RV32I-NEXT: xori a7, a2, 31 -; RV32I-NEXT: srl a6, a6, a7 -; RV32I-NEXT: or a4, a4, a6 -; RV32I-NEXT: lbu a6, 9(a1) -; RV32I-NEXT: lbu t0, 8(a1) -; RV32I-NEXT: lbu t1, 
10(a1) -; RV32I-NEXT: lbu t2, 11(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a6, a6, t0 +; RV32I-NEXT: addi a3, sp, 20 +; RV32I-NEXT: sub a3, a3, a1 +; RV32I-NEXT: lbu a1, 0(a3) +; RV32I-NEXT: lbu a4, 1(a3) +; RV32I-NEXT: lbu a5, 2(a3) +; RV32I-NEXT: lbu a6, 3(a3) +; RV32I-NEXT: lbu a7, 4(a3) +; RV32I-NEXT: lbu t0, 5(a3) +; RV32I-NEXT: lbu t1, 6(a3) +; RV32I-NEXT: lbu t2, 7(a3) +; RV32I-NEXT: lbu t3, 8(a3) +; RV32I-NEXT: lbu t4, 9(a3) +; RV32I-NEXT: lbu t5, 10(a3) +; RV32I-NEXT: lbu t6, 11(a3) +; RV32I-NEXT: lbu s0, 12(a3) +; RV32I-NEXT: lbu s1, 13(a3) +; RV32I-NEXT: lbu s2, 14(a3) +; RV32I-NEXT: lbu a3, 15(a3) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a7, t0, a7 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: sll t0, a6, a2 -; RV32I-NEXT: srli a3, a3, 1 -; RV32I-NEXT: not t1, a2 -; RV32I-NEXT: srl a3, a3, t1 -; RV32I-NEXT: or a3, t0, a3 -; RV32I-NEXT: lbu t0, 13(a1) -; RV32I-NEXT: lbu t1, 12(a1) -; RV32I-NEXT: lbu t2, 14(a1) -; RV32I-NEXT: lbu a1, 15(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t2 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: andi a2, a2, 7 +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: sll a4, a7, a2 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: srli a5, a1, 1 +; RV32I-NEXT: xori a6, a2, 31 +; RV32I-NEXT: srl a5, a5, a6 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or a5, t4, t3 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: srli a7, a7, 1 +; RV32I-NEXT: not t0, a2 +; RV32I-NEXT: srl a7, a7, t0 +; RV32I-NEXT: sll t0, a5, a2 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a3, a3, s2 +; RV32I-NEXT: or a3, a3, s0 +; RV32I-NEXT: sll a3, a3, a2 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: srl a5, a5, a6 +; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: sll a1, a1, a2 -; RV32I-NEXT: srli a6, a6, 1 -; RV32I-NEXT: srl a6, a6, a7 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: sll a2, a5, a2 -; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a1, 12(a0) -; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a7, 8(a0) ; RV32I-NEXT: sw a4, 4(a0) -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 48 ; RV32I-NEXT: ret ; ; RV64I-LABEL: shl128: @@ -616,10 +634,10 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-LABEL: fshr128_minsize: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 8(a1) -; RV32I-NEXT: lw t2, 0(a1) ; RV32I-NEXT: lw a2, 0(a2) +; RV32I-NEXT: lw t2, 0(a1) ; RV32I-NEXT: lw a7, 4(a1) +; RV32I-NEXT: lw a3, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) ; RV32I-NEXT: andi t1, a2, 64 ; RV32I-NEXT: mv t0, a7 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index 122388c1b73ec..04533659cd146 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ 
b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -310,22 +310,22 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s6, 0(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lbu a0, 12(a0) -; RV32-NEXT: lw a1, 8(s0) -; RV32-NEXT: slli a2, a0, 30 +; RV32-NEXT: lbu a1, 12(a0) +; RV32-NEXT: lw a2, 8(a0) +; RV32-NEXT: lw a0, 0(a0) ; RV32-NEXT: lw a3, 4(s0) -; RV32-NEXT: srli s1, a1, 2 -; RV32-NEXT: or s1, s1, a2 -; RV32-NEXT: slli a2, a1, 31 -; RV32-NEXT: srli a4, a3, 1 -; RV32-NEXT: or s2, a4, a2 -; RV32-NEXT: srli a0, a0, 2 -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai s3, a0, 31 -; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: slli a4, a1, 30 +; RV32-NEXT: srli s1, a2, 2 +; RV32-NEXT: or s1, s1, a4 +; RV32-NEXT: slli a4, a2, 31 +; RV32-NEXT: srli a5, a3, 1 +; RV32-NEXT: or s2, a5, a4 +; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: slli a1, a1, 31 -; RV32-NEXT: lw a0, 0(s0) -; RV32-NEXT: srai s4, a1, 31 +; RV32-NEXT: srai s3, a1, 31 +; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: slli a2, a2, 31 +; RV32-NEXT: srai s4, a2, 31 ; RV32-NEXT: slli a1, a3, 31 ; RV32-NEXT: srai a1, a1, 31 ; RV32-NEXT: li a2, 6 @@ -391,8 +391,8 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64-NEXT: mv s0, a0 ; RV64-NEXT: lbu a0, 12(a0) ; RV64-NEXT: lwu a1, 8(s0) -; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: ld a2, 0(s0) +; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: slli a0, a0, 29 ; RV64-NEXT: srai s1, a0, 31 @@ -462,22 +462,22 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32M-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32M-NEXT: sw s6, 0(sp) # 4-byte Folded Spill ; RV32M-NEXT: mv s0, a0 -; RV32M-NEXT: lbu a0, 12(a0) -; RV32M-NEXT: lw a1, 8(s0) -; RV32M-NEXT: slli a2, a0, 30 +; RV32M-NEXT: lbu a1, 12(a0) +; RV32M-NEXT: lw a2, 8(a0) +; RV32M-NEXT: lw a0, 0(a0) ; RV32M-NEXT: lw a3, 4(s0) -; RV32M-NEXT: srli s1, a1, 2 -; RV32M-NEXT: or s1, s1, a2 -; RV32M-NEXT: slli a2, a1, 31 -; RV32M-NEXT: srli a4, a3, 1 -; RV32M-NEXT: or s2, a4, a2 -; RV32M-NEXT: srli a0, a0, 2 -; RV32M-NEXT: slli a0, a0, 31 -; RV32M-NEXT: srai s3, a0, 31 -; RV32M-NEXT: srli a1, a1, 1 +; RV32M-NEXT: slli a4, a1, 30 +; RV32M-NEXT: srli s1, a2, 2 +; RV32M-NEXT: or s1, s1, a4 +; RV32M-NEXT: slli a4, a2, 31 +; RV32M-NEXT: srli a5, a3, 1 +; RV32M-NEXT: or s2, a5, a4 +; RV32M-NEXT: srli a1, a1, 2 ; RV32M-NEXT: slli a1, a1, 31 -; RV32M-NEXT: lw a0, 0(s0) -; RV32M-NEXT: srai s4, a1, 31 +; RV32M-NEXT: srai s3, a1, 31 +; RV32M-NEXT: srli a2, a2, 1 +; RV32M-NEXT: slli a2, a2, 31 +; RV32M-NEXT: srai s4, a2, 31 ; RV32M-NEXT: slli a1, a3, 31 ; RV32M-NEXT: srai a1, a1, 31 ; RV32M-NEXT: li a2, 6 @@ -536,34 +536,34 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64M: # %bb.0: ; RV64M-NEXT: ld a1, 0(a0) ; RV64M-NEXT: lwu a2, 8(a0) -; RV64M-NEXT: srli a3, a1, 2 -; RV64M-NEXT: lbu a4, 12(a0) +; RV64M-NEXT: lbu a3, 12(a0) +; RV64M-NEXT: srli a4, a1, 2 ; RV64M-NEXT: slli a5, a2, 62 -; RV64M-NEXT: or a3, a5, a3 -; RV64M-NEXT: srai a3, a3, 31 -; RV64M-NEXT: slli a4, a4, 32 -; RV64M-NEXT: or a2, a2, a4 +; RV64M-NEXT: or a4, a5, a4 +; RV64M-NEXT: srai a4, a4, 31 +; RV64M-NEXT: slli a3, a3, 32 +; RV64M-NEXT: or a2, a2, a3 ; RV64M-NEXT: slli a2, a2, 29 -; RV64M-NEXT: lui a4, %hi(.LCPI3_0) -; RV64M-NEXT: ld a4, %lo(.LCPI3_0)(a4) +; RV64M-NEXT: lui a3, %hi(.LCPI3_0) +; RV64M-NEXT: ld a3, %lo(.LCPI3_0)(a3) ; RV64M-NEXT: srai a2, a2, 31 ; RV64M-NEXT: slli a1, a1, 31 ; RV64M-NEXT: srai a1, a1, 31 -; RV64M-NEXT: mulh a4, a2, a4 -; RV64M-NEXT: srli a5, a4, 63 -; RV64M-NEXT: 
srai a4, a4, 1 -; RV64M-NEXT: add a4, a4, a5 +; RV64M-NEXT: mulh a3, a2, a3 +; RV64M-NEXT: srli a5, a3, 63 +; RV64M-NEXT: srai a3, a3, 1 +; RV64M-NEXT: add a3, a3, a5 ; RV64M-NEXT: lui a5, %hi(.LCPI3_1) ; RV64M-NEXT: ld a5, %lo(.LCPI3_1)(a5) -; RV64M-NEXT: add a2, a2, a4 -; RV64M-NEXT: slli a4, a4, 2 -; RV64M-NEXT: add a2, a2, a4 -; RV64M-NEXT: mulh a4, a3, a5 -; RV64M-NEXT: srli a5, a4, 63 -; RV64M-NEXT: srai a4, a4, 1 -; RV64M-NEXT: add a4, a4, a5 -; RV64M-NEXT: slli a5, a4, 3 -; RV64M-NEXT: add a3, a3, a4 +; RV64M-NEXT: add a2, a2, a3 +; RV64M-NEXT: slli a3, a3, 2 +; RV64M-NEXT: add a2, a2, a3 +; RV64M-NEXT: mulh a3, a4, a5 +; RV64M-NEXT: srli a5, a3, 63 +; RV64M-NEXT: srai a3, a3, 1 +; RV64M-NEXT: add a3, a3, a5 +; RV64M-NEXT: slli a5, a3, 3 +; RV64M-NEXT: add a3, a4, a3 ; RV64M-NEXT: sub a3, a3, a5 ; RV64M-NEXT: addi a3, a3, -1 ; RV64M-NEXT: seqz a3, a3 @@ -612,22 +612,22 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: slli a1, a1, 1 ; RV32MV-NEXT: sub sp, sp, a1 ; RV32MV-NEXT: mv s0, a0 -; RV32MV-NEXT: lbu a0, 12(a0) -; RV32MV-NEXT: lw a1, 8(s0) -; RV32MV-NEXT: slli a2, a0, 30 +; RV32MV-NEXT: lbu a1, 12(a0) +; RV32MV-NEXT: lw a2, 8(a0) +; RV32MV-NEXT: lw a0, 0(a0) ; RV32MV-NEXT: lw a3, 4(s0) -; RV32MV-NEXT: srli s1, a1, 2 -; RV32MV-NEXT: or s1, s1, a2 -; RV32MV-NEXT: slli a2, a1, 31 -; RV32MV-NEXT: srli a4, a3, 1 -; RV32MV-NEXT: or s2, a4, a2 -; RV32MV-NEXT: srli a0, a0, 2 -; RV32MV-NEXT: slli a0, a0, 31 -; RV32MV-NEXT: srai s3, a0, 31 -; RV32MV-NEXT: srli a1, a1, 1 +; RV32MV-NEXT: slli a4, a1, 30 +; RV32MV-NEXT: srli s1, a2, 2 +; RV32MV-NEXT: or s1, s1, a4 +; RV32MV-NEXT: slli a4, a2, 31 +; RV32MV-NEXT: srli a5, a3, 1 +; RV32MV-NEXT: or s2, a5, a4 +; RV32MV-NEXT: srli a1, a1, 2 ; RV32MV-NEXT: slli a1, a1, 31 -; RV32MV-NEXT: lw a0, 0(s0) -; RV32MV-NEXT: srai s4, a1, 31 +; RV32MV-NEXT: srai s3, a1, 31 +; RV32MV-NEXT: srli a2, a2, 1 +; RV32MV-NEXT: slli a2, a2, 31 +; RV32MV-NEXT: srai s4, a2, 31 ; RV32MV-NEXT: slli a1, a3, 31 ; RV32MV-NEXT: srai a1, a1, 31 ; RV32MV-NEXT: li a2, 6 @@ -727,46 +727,46 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64MV: # %bb.0: ; RV64MV-NEXT: ld a1, 0(a0) ; RV64MV-NEXT: lwu a2, 8(a0) -; RV64MV-NEXT: srli a3, a1, 2 -; RV64MV-NEXT: lbu a4, 12(a0) +; RV64MV-NEXT: lbu a3, 12(a0) +; RV64MV-NEXT: srli a4, a1, 2 ; RV64MV-NEXT: slli a5, a2, 62 -; RV64MV-NEXT: or a3, a5, a3 -; RV64MV-NEXT: srai a3, a3, 31 -; RV64MV-NEXT: slli a4, a4, 32 -; RV64MV-NEXT: or a2, a2, a4 +; RV64MV-NEXT: or a4, a5, a4 +; RV64MV-NEXT: srai a4, a4, 31 +; RV64MV-NEXT: slli a3, a3, 32 +; RV64MV-NEXT: or a2, a2, a3 ; RV64MV-NEXT: slli a2, a2, 29 -; RV64MV-NEXT: lui a4, %hi(.LCPI3_0) -; RV64MV-NEXT: ld a4, %lo(.LCPI3_0)(a4) +; RV64MV-NEXT: lui a3, %hi(.LCPI3_0) +; RV64MV-NEXT: ld a3, %lo(.LCPI3_0)(a3) ; RV64MV-NEXT: srai a2, a2, 31 ; RV64MV-NEXT: slli a1, a1, 31 ; RV64MV-NEXT: srai a1, a1, 31 -; RV64MV-NEXT: mulh a4, a2, a4 -; RV64MV-NEXT: srli a5, a4, 63 -; RV64MV-NEXT: srai a4, a4, 1 -; RV64MV-NEXT: add a4, a4, a5 +; RV64MV-NEXT: mulh a3, a2, a3 +; RV64MV-NEXT: srli a5, a3, 63 +; RV64MV-NEXT: srai a3, a3, 1 +; RV64MV-NEXT: add a3, a3, a5 ; RV64MV-NEXT: lui a5, %hi(.LCPI3_1) ; RV64MV-NEXT: ld a5, %lo(.LCPI3_1)(a5) -; RV64MV-NEXT: add a2, a2, a4 -; RV64MV-NEXT: slli a4, a4, 2 -; RV64MV-NEXT: add a2, a2, a4 -; RV64MV-NEXT: mulh a4, a3, a5 -; RV64MV-NEXT: srli a5, a4, 63 -; RV64MV-NEXT: srai a4, a4, 1 -; RV64MV-NEXT: add a4, a4, a5 +; RV64MV-NEXT: add a2, a2, a3 +; RV64MV-NEXT: slli a3, a3, 2 +; RV64MV-NEXT: add a2, a2, a3 +; RV64MV-NEXT: mulh a3, a4, a5 +; RV64MV-NEXT: 
srli a5, a3, 63 +; RV64MV-NEXT: srai a3, a3, 1 +; RV64MV-NEXT: add a3, a3, a5 ; RV64MV-NEXT: lui a5, %hi(.LCPI3_2) ; RV64MV-NEXT: ld a5, %lo(.LCPI3_2)(a5) -; RV64MV-NEXT: add a3, a3, a4 -; RV64MV-NEXT: slli a4, a4, 3 -; RV64MV-NEXT: sub a3, a3, a4 -; RV64MV-NEXT: mulh a4, a1, a5 -; RV64MV-NEXT: srli a5, a4, 63 -; RV64MV-NEXT: add a4, a4, a5 +; RV64MV-NEXT: add a4, a4, a3 +; RV64MV-NEXT: slli a3, a3, 3 +; RV64MV-NEXT: sub a4, a4, a3 +; RV64MV-NEXT: mulh a3, a1, a5 +; RV64MV-NEXT: srli a5, a3, 63 +; RV64MV-NEXT: add a3, a3, a5 ; RV64MV-NEXT: li a5, 6 -; RV64MV-NEXT: mul a4, a4, a5 -; RV64MV-NEXT: sub a1, a1, a4 +; RV64MV-NEXT: mul a3, a3, a5 +; RV64MV-NEXT: sub a1, a1, a3 ; RV64MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64MV-NEXT: vmv.v.x v8, a1 -; RV64MV-NEXT: vslide1down.vx v8, v8, a3 +; RV64MV-NEXT: vslide1down.vx v8, v8, a4 ; RV64MV-NEXT: vslide1down.vx v8, v8, a2 ; RV64MV-NEXT: vslidedown.vi v8, v8, 1 ; RV64MV-NEXT: li a1, -1 diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index 3335ca3a34b6c..2d84cbf3e41fd 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -18,29 +18,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lh s0, 12(a1) -; RV32I-NEXT: lh s1, 8(a1) -; RV32I-NEXT: lh s2, 4(a1) ; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: lh s0, 4(a1) +; RV32I-NEXT: lh s1, 8(a1) +; RV32I-NEXT: lh s2, 12(a1) ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, -124 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __modsi3@plt -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a1, 98 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: li a1, -1003 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: sh a0, 6(s3) ; RV32I-NEXT: sh s1, 4(s3) -; RV32I-NEXT: sh s2, 2(s3) +; RV32I-NEXT: sh s0, 2(s3) ; RV32I-NEXT: sh s4, 0(s3) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -53,52 +53,52 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_srem_vec_1: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 12(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a4, 0(a1) -; RV32IM-NEXT: lh a1, 4(a1) +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a4, 8(a1) +; RV32IM-NEXT: lh a1, 12(a1) ; RV32IM-NEXT: lui a5, 706409 ; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a5 -; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: mulh a5, a2, a5 +; RV32IM-NEXT: add a5, a5, a2 ; RV32IM-NEXT: srli a6, a5, 31 ; RV32IM-NEXT: srli a5, a5, 6 ; RV32IM-NEXT: add a5, a5, a6 ; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a4, a4, a5 +; RV32IM-NEXT: sub a2, a2, a5 ; RV32IM-NEXT: lui a5, 507375 ; RV32IM-NEXT: addi a5, a5, 1981 -; RV32IM-NEXT: mulh a5, a1, a5 -; RV32IM-NEXT: sub a5, a5, a1 +; RV32IM-NEXT: mulh a5, a3, a5 +; RV32IM-NEXT: sub a5, a5, a3 ; RV32IM-NEXT: srli a6, a5, 31 ; RV32IM-NEXT: srli a5, a5, 6 ; RV32IM-NEXT: add a5, a5, a6 ; RV32IM-NEXT: li a6, -124 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: sub a3, a3, a5 ; RV32IM-NEXT: lui a5, 342392 ; 
RV32IM-NEXT: addi a5, a5, 669 -; RV32IM-NEXT: mulh a5, a3, a5 +; RV32IM-NEXT: mulh a5, a4, a5 ; RV32IM-NEXT: srli a6, a5, 31 ; RV32IM-NEXT: srli a5, a5, 5 ; RV32IM-NEXT: add a5, a5, a6 ; RV32IM-NEXT: li a6, 98 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a3, a3, a5 +; RV32IM-NEXT: sub a4, a4, a5 ; RV32IM-NEXT: lui a5, 780943 ; RV32IM-NEXT: addi a5, a5, 1809 -; RV32IM-NEXT: mulh a5, a2, a5 +; RV32IM-NEXT: mulh a5, a1, a5 ; RV32IM-NEXT: srli a6, a5, 31 ; RV32IM-NEXT: srli a5, a5, 8 ; RV32IM-NEXT: add a5, a5, a6 ; RV32IM-NEXT: li a6, -1003 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: sh a2, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_srem_vec_1: @@ -110,29 +110,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lh s0, 24(a1) -; RV64I-NEXT: lh s1, 16(a1) -; RV64I-NEXT: lh s2, 8(a1) ; RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: lh s0, 8(a1) +; RV64I-NEXT: lh s1, 16(a1) +; RV64I-NEXT: lh s2, 24(a1) ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, -124 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __moddi3@plt -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: li a1, 98 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: li a1, -1003 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: sh a0, 6(s3) ; RV64I-NEXT: sh s1, 4(s3) -; RV64I-NEXT: sh s2, 2(s3) +; RV64I-NEXT: sh s0, 2(s3) ; RV64I-NEXT: sh s4, 0(s3) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -145,52 +145,52 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_srem_vec_1: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 0(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI0_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI0_0)(a3) -; RV64IM-NEXT: lh a4, 24(a1) +; RV64IM-NEXT: lui a2, %hi(.LCPI0_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI0_0)(a2) +; RV64IM-NEXT: lh a3, 0(a1) +; RV64IM-NEXT: lh a4, 8(a1) ; RV64IM-NEXT: lh a5, 16(a1) -; RV64IM-NEXT: lh a1, 8(a1) -; RV64IM-NEXT: mulh a3, a2, a3 -; RV64IM-NEXT: add a3, a3, a2 -; RV64IM-NEXT: srli a6, a3, 63 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: add a3, a3, a6 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: mulh a2, a3, a2 +; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: srli a6, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a6 ; RV64IM-NEXT: lui a6, %hi(.LCPI0_1) ; RV64IM-NEXT: ld a6, %lo(.LCPI0_1)(a6) ; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: mulh a3, a1, a6 -; RV64IM-NEXT: sub a3, a3, a1 -; RV64IM-NEXT: srli a6, a3, 63 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: add a3, a3, a6 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: subw a3, a3, a2 +; RV64IM-NEXT: mulh a2, a4, a6 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: srli a6, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a6 ; RV64IM-NEXT: lui a6, %hi(.LCPI0_2) ; RV64IM-NEXT: ld a6, %lo(.LCPI0_2)(a6) ; RV64IM-NEXT: li a7, -124 -; 
RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: mulh a3, a5, a6 -; RV64IM-NEXT: srli a6, a3, 63 -; RV64IM-NEXT: srli a3, a3, 5 -; RV64IM-NEXT: add a3, a3, a6 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: subw a4, a4, a2 +; RV64IM-NEXT: mulh a2, a5, a6 +; RV64IM-NEXT: srli a6, a2, 63 +; RV64IM-NEXT: srli a2, a2, 5 +; RV64IM-NEXT: add a2, a2, a6 ; RV64IM-NEXT: lui a6, %hi(.LCPI0_3) ; RV64IM-NEXT: ld a6, %lo(.LCPI0_3)(a6) ; RV64IM-NEXT: li a7, 98 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a5, a5, a3 -; RV64IM-NEXT: mulh a3, a4, a6 -; RV64IM-NEXT: srli a6, a3, 63 -; RV64IM-NEXT: srli a3, a3, 7 -; RV64IM-NEXT: add a3, a3, a6 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: subw a5, a5, a2 +; RV64IM-NEXT: mulh a2, a1, a6 +; RV64IM-NEXT: srli a6, a2, 63 +; RV64IM-NEXT: srli a2, a2, 7 +; RV64IM-NEXT: add a2, a2, a6 ; RV64IM-NEXT: li a6, -1003 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: sh a5, 4(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -206,29 +206,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lh s0, 12(a1) -; RV32I-NEXT: lh s1, 8(a1) -; RV32I-NEXT: lh s2, 4(a1) ; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: lh s0, 4(a1) +; RV32I-NEXT: lh s1, 8(a1) +; RV32I-NEXT: lh s2, 12(a1) ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __modsi3@plt -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: sh a0, 6(s3) ; RV32I-NEXT: sh s1, 4(s3) -; RV32I-NEXT: sh s2, 2(s3) +; RV32I-NEXT: sh s0, 2(s3) ; RV32I-NEXT: sh s4, 0(s3) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -241,45 +241,45 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_srem_vec_2: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 12(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a4, 0(a1) -; RV32IM-NEXT: lh a1, 4(a1) +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a4, 8(a1) +; RV32IM-NEXT: lh a1, 12(a1) ; RV32IM-NEXT: lui a5, 706409 ; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a6, a4, a5 -; RV32IM-NEXT: add a6, a6, a4 +; RV32IM-NEXT: mulh a6, a2, a5 +; RV32IM-NEXT: add a6, a6, a2 ; RV32IM-NEXT: srli a7, a6, 31 ; RV32IM-NEXT: srli a6, a6, 6 ; RV32IM-NEXT: add a6, a6, a7 ; RV32IM-NEXT: li a7, 95 ; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a4, a4, a6 -; RV32IM-NEXT: mulh a6, a1, a5 -; RV32IM-NEXT: add a6, a6, a1 +; RV32IM-NEXT: sub a2, a2, a6 +; RV32IM-NEXT: mulh a6, a3, a5 +; RV32IM-NEXT: add a6, a6, a3 ; RV32IM-NEXT: srli t0, a6, 31 ; RV32IM-NEXT: srli a6, a6, 6 ; RV32IM-NEXT: add a6, a6, t0 ; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a1, a1, a6 -; RV32IM-NEXT: mulh a6, a3, a5 -; RV32IM-NEXT: add a6, a6, a3 +; RV32IM-NEXT: 
sub a3, a3, a6 +; RV32IM-NEXT: mulh a6, a4, a5 +; RV32IM-NEXT: add a6, a6, a4 ; RV32IM-NEXT: srli t0, a6, 31 ; RV32IM-NEXT: srli a6, a6, 6 ; RV32IM-NEXT: add a6, a6, t0 ; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a3, a3, a6 -; RV32IM-NEXT: mulh a5, a2, a5 -; RV32IM-NEXT: add a5, a5, a2 +; RV32IM-NEXT: sub a4, a4, a6 +; RV32IM-NEXT: mulh a5, a1, a5 +; RV32IM-NEXT: add a5, a5, a1 ; RV32IM-NEXT: srli a6, a5, 31 ; RV32IM-NEXT: srli a5, a5, 6 ; RV32IM-NEXT: add a5, a5, a6 ; RV32IM-NEXT: mul a5, a5, a7 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: sh a2, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_srem_vec_2: @@ -291,29 +291,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lh s0, 24(a1) -; RV64I-NEXT: lh s1, 16(a1) -; RV64I-NEXT: lh s2, 8(a1) ; RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: lh s0, 8(a1) +; RV64I-NEXT: lh s1, 16(a1) +; RV64I-NEXT: lh s2, 24(a1) ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __moddi3@plt -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: sh a0, 6(s3) ; RV64I-NEXT: sh s1, 4(s3) -; RV64I-NEXT: sh s2, 2(s3) +; RV64I-NEXT: sh s0, 2(s3) ; RV64I-NEXT: sh s4, 0(s3) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -326,45 +326,45 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_srem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 0(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI1_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI1_0)(a3) -; RV64IM-NEXT: lh a4, 24(a1) +; RV64IM-NEXT: lui a2, %hi(.LCPI1_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI1_0)(a2) +; RV64IM-NEXT: lh a3, 0(a1) +; RV64IM-NEXT: lh a4, 8(a1) ; RV64IM-NEXT: lh a5, 16(a1) -; RV64IM-NEXT: lh a1, 8(a1) -; RV64IM-NEXT: mulh a6, a2, a3 -; RV64IM-NEXT: add a6, a6, a2 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: mulh a6, a3, a2 +; RV64IM-NEXT: add a6, a6, a3 ; RV64IM-NEXT: srli a7, a6, 63 ; RV64IM-NEXT: srli a6, a6, 6 ; RV64IM-NEXT: add a6, a6, a7 ; RV64IM-NEXT: li a7, 95 ; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: mulh a6, a1, a3 -; RV64IM-NEXT: add a6, a6, a1 +; RV64IM-NEXT: subw a3, a3, a6 +; RV64IM-NEXT: mulh a6, a4, a2 +; RV64IM-NEXT: add a6, a6, a4 ; RV64IM-NEXT: srli t0, a6, 63 ; RV64IM-NEXT: srli a6, a6, 6 ; RV64IM-NEXT: add a6, a6, t0 ; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a1, a1, a6 -; RV64IM-NEXT: mulh a6, a5, a3 +; RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: mulh a6, a5, a2 ; RV64IM-NEXT: add a6, a6, a5 ; RV64IM-NEXT: srli t0, a6, 63 ; RV64IM-NEXT: srli a6, a6, 6 ; RV64IM-NEXT: add a6, a6, t0 ; RV64IM-NEXT: mul a6, a6, a7 ; RV64IM-NEXT: subw a5, a5, a6 -; RV64IM-NEXT: mulh a3, a4, a3 -; RV64IM-NEXT: add a3, a3, a4 -; RV64IM-NEXT: srli a6, a3, 63 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: add 
a3, a3, a6 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: mulh a2, a1, a2 +; RV64IM-NEXT: add a2, a2, a1 +; RV64IM-NEXT: srli a6, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a6 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: sh a5, 4(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -445,14 +445,14 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: combine_srem_sdiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 0(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a4, 12(a1) +; RV32IM-NEXT: lh a2, 12(a1) +; RV32IM-NEXT: lh a3, 0(a1) +; RV32IM-NEXT: lh a4, 4(a1) ; RV32IM-NEXT: lh a1, 8(a1) ; RV32IM-NEXT: lui a5, 706409 ; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a6, a4, a5 -; RV32IM-NEXT: add a6, a6, a4 +; RV32IM-NEXT: mulh a6, a2, a5 +; RV32IM-NEXT: add a6, a6, a2 ; RV32IM-NEXT: srli a7, a6, 31 ; RV32IM-NEXT: srai a6, a6, 6 ; RV32IM-NEXT: add a6, a6, a7 @@ -464,30 +464,30 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV32IM-NEXT: srai t1, t1, 6 ; RV32IM-NEXT: add t1, t1, t2 ; RV32IM-NEXT: mul t2, t1, a7 -; RV32IM-NEXT: mulh t3, a3, a5 -; RV32IM-NEXT: add t3, t3, a3 +; RV32IM-NEXT: mulh t3, a4, a5 +; RV32IM-NEXT: add t3, t3, a4 ; RV32IM-NEXT: srli t4, t3, 31 ; RV32IM-NEXT: srai t3, t3, 6 ; RV32IM-NEXT: add t3, t3, t4 ; RV32IM-NEXT: mul t4, t3, a7 -; RV32IM-NEXT: mulh a5, a2, a5 -; RV32IM-NEXT: add a5, a5, a2 +; RV32IM-NEXT: mulh a5, a3, a5 +; RV32IM-NEXT: add a5, a5, a3 ; RV32IM-NEXT: srli t5, a5, 31 ; RV32IM-NEXT: srai a5, a5, 6 ; RV32IM-NEXT: add a5, a5, t5 ; RV32IM-NEXT: mul a7, a5, a7 -; RV32IM-NEXT: add a2, a2, a5 -; RV32IM-NEXT: sub a2, a2, a7 -; RV32IM-NEXT: add a3, a3, t3 -; RV32IM-NEXT: sub a3, a3, t4 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: sub a3, a3, a7 +; RV32IM-NEXT: add a4, a4, t3 +; RV32IM-NEXT: sub a4, a4, t4 ; RV32IM-NEXT: add a1, a1, t1 ; RV32IM-NEXT: sub a1, a1, t2 -; RV32IM-NEXT: add a4, a4, a6 -; RV32IM-NEXT: sub a4, a4, t0 -; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: add a2, a2, a6 +; RV32IM-NEXT: sub a2, a2, t0 +; RV32IM-NEXT: sh a2, 6(a0) ; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a3, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a4, 2(a0) +; RV32IM-NEXT: sh a3, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_srem_sdiv: @@ -624,21 +624,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: lh a3, 4(a1) +; RV32I-NEXT: lh a4, 8(a1) ; RV32I-NEXT: lh a0, 12(a1) -; RV32I-NEXT: lh a3, 8(a1) -; RV32I-NEXT: lh a1, 4(a1) -; RV32I-NEXT: srli a4, a2, 26 -; RV32I-NEXT: add a4, a2, a4 -; RV32I-NEXT: andi a4, a4, -64 -; RV32I-NEXT: sub s1, a2, a4 -; RV32I-NEXT: srli a2, a1, 27 -; RV32I-NEXT: add a2, a1, a2 -; RV32I-NEXT: andi a2, a2, -32 -; RV32I-NEXT: sub s2, a1, a2 -; RV32I-NEXT: srli a1, a3, 29 +; RV32I-NEXT: srli a1, a2, 26 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: andi a1, a1, -64 +; RV32I-NEXT: sub s1, a2, a1 +; RV32I-NEXT: srli a1, a3, 27 ; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: andi a1, a1, -32 +; RV32I-NEXT: sub s2, a3, a1 +; RV32I-NEXT: srli a1, a4, 29 +; RV32I-NEXT: add a1, a4, a1 ; RV32I-NEXT: andi a1, a1, -8 -; RV32I-NEXT: sub s3, a3, a1 +; RV32I-NEXT: sub s3, a4, a1 ; RV32I-NEXT: 
li a1, 95 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: sh a0, 6(s0) @@ -655,36 +655,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: dont_fold_srem_power_of_two: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 8(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a4, 12(a1) -; RV32IM-NEXT: lh a1, 0(a1) +; RV32IM-NEXT: lh a2, 12(a1) +; RV32IM-NEXT: lh a3, 0(a1) +; RV32IM-NEXT: lh a4, 4(a1) +; RV32IM-NEXT: lh a1, 8(a1) ; RV32IM-NEXT: lui a5, 706409 ; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a5 -; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: mulh a5, a2, a5 +; RV32IM-NEXT: add a5, a5, a2 ; RV32IM-NEXT: srli a6, a5, 31 ; RV32IM-NEXT: srli a5, a5, 6 ; RV32IM-NEXT: add a5, a5, a6 ; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: srli a5, a1, 26 -; RV32IM-NEXT: add a5, a1, a5 -; RV32IM-NEXT: andi a5, a5, -64 -; RV32IM-NEXT: sub a1, a1, a5 -; RV32IM-NEXT: srli a5, a3, 27 +; RV32IM-NEXT: sub a2, a2, a5 +; RV32IM-NEXT: srli a5, a3, 26 ; RV32IM-NEXT: add a5, a3, a5 -; RV32IM-NEXT: andi a5, a5, -32 +; RV32IM-NEXT: andi a5, a5, -64 ; RV32IM-NEXT: sub a3, a3, a5 -; RV32IM-NEXT: srli a5, a2, 29 -; RV32IM-NEXT: add a5, a2, a5 +; RV32IM-NEXT: srli a5, a4, 27 +; RV32IM-NEXT: add a5, a4, a5 +; RV32IM-NEXT: andi a5, a5, -32 +; RV32IM-NEXT: sub a4, a4, a5 +; RV32IM-NEXT: srli a5, a1, 29 +; RV32IM-NEXT: add a5, a1, a5 ; RV32IM-NEXT: andi a5, a5, -8 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: sh a2, 4(a0) -; RV32IM-NEXT: sh a3, 2(a0) -; RV32IM-NEXT: sh a1, 0(a0) -; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: sh a1, 4(a0) +; RV32IM-NEXT: sh a4, 2(a0) +; RV32IM-NEXT: sh a3, 0(a0) +; RV32IM-NEXT: sh a2, 6(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_power_of_two: @@ -697,21 +697,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: lh a3, 8(a1) +; RV64I-NEXT: lh a4, 16(a1) ; RV64I-NEXT: lh a0, 24(a1) -; RV64I-NEXT: lh a3, 16(a1) -; RV64I-NEXT: lh a1, 8(a1) -; RV64I-NEXT: srli a4, a2, 58 -; RV64I-NEXT: add a4, a2, a4 -; RV64I-NEXT: andi a4, a4, -64 -; RV64I-NEXT: subw s1, a2, a4 -; RV64I-NEXT: srli a2, a1, 59 -; RV64I-NEXT: add a2, a1, a2 -; RV64I-NEXT: andi a2, a2, -32 -; RV64I-NEXT: subw s2, a1, a2 -; RV64I-NEXT: srli a1, a3, 61 +; RV64I-NEXT: srli a1, a2, 58 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: andi a1, a1, -64 +; RV64I-NEXT: subw s1, a2, a1 +; RV64I-NEXT: srli a1, a3, 59 ; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: andi a1, a1, -32 +; RV64I-NEXT: subw s2, a3, a1 +; RV64I-NEXT: srli a1, a4, 61 +; RV64I-NEXT: add a1, a4, a1 ; RV64I-NEXT: andi a1, a1, -8 -; RV64I-NEXT: subw s3, a3, a1 +; RV64I-NEXT: subw s3, a4, a1 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: sh a0, 6(s0) @@ -731,9 +731,9 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV64IM-NEXT: lh a2, 24(a1) ; RV64IM-NEXT: lui a3, %hi(.LCPI3_0) ; RV64IM-NEXT: ld a3, %lo(.LCPI3_0)(a3) -; RV64IM-NEXT: lh a4, 16(a1) +; RV64IM-NEXT: lh a4, 0(a1) ; RV64IM-NEXT: lh a5, 8(a1) -; RV64IM-NEXT: lh a1, 0(a1) +; RV64IM-NEXT: lh a1, 16(a1) ; RV64IM-NEXT: mulh a3, a2, a3 ; RV64IM-NEXT: add a3, a3, a2 ; RV64IM-NEXT: srli a6, a3, 63 @@ -742,21 +742,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV64IM-NEXT: li a6, 95 ; RV64IM-NEXT: mul a3, a3, a6 ; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: srli a3, 
a1, 58 -; RV64IM-NEXT: add a3, a1, a3 +; RV64IM-NEXT: srli a3, a4, 58 +; RV64IM-NEXT: add a3, a4, a3 ; RV64IM-NEXT: andi a3, a3, -64 -; RV64IM-NEXT: subw a1, a1, a3 +; RV64IM-NEXT: subw a4, a4, a3 ; RV64IM-NEXT: srli a3, a5, 59 ; RV64IM-NEXT: add a3, a5, a3 ; RV64IM-NEXT: andi a3, a3, -32 ; RV64IM-NEXT: subw a5, a5, a3 -; RV64IM-NEXT: srli a3, a4, 61 -; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: srli a3, a1, 61 +; RV64IM-NEXT: add a3, a1, a3 ; RV64IM-NEXT: andi a3, a3, -8 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: sh a4, 4(a0) +; RV64IM-NEXT: subw a1, a1, a3 +; RV64IM-NEXT: sh a1, 4(a0) ; RV64IM-NEXT: sh a5, 2(a0) -; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a4, 0(a0) ; RV64IM-NEXT: sh a2, 6(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, @@ -773,24 +773,24 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lh s0, 12(a1) -; RV32I-NEXT: lh s1, 8(a1) ; RV32I-NEXT: lh a2, 4(a1) +; RV32I-NEXT: lh s0, 8(a1) +; RV32I-NEXT: lh s1, 12(a1) ; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: li a1, 654 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: li a1, 23 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __modsi3@plt -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a1, a0, 1327 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __modsi3@plt ; RV32I-NEXT: sh a0, 6(s2) -; RV32I-NEXT: sh s1, 4(s2) +; RV32I-NEXT: sh s0, 4(s2) ; RV32I-NEXT: sh s3, 2(s2) ; RV32I-NEXT: sh zero, 0(s2) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -803,43 +803,43 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: dont_fold_srem_one: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 12(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a1, 8(a1) +; RV32IM-NEXT: lh a2, 4(a1) +; RV32IM-NEXT: lh a3, 8(a1) +; RV32IM-NEXT: lh a1, 12(a1) ; RV32IM-NEXT: lui a4, 820904 ; RV32IM-NEXT: addi a4, a4, -1903 -; RV32IM-NEXT: mulh a4, a3, a4 -; RV32IM-NEXT: add a4, a4, a3 +; RV32IM-NEXT: mulh a4, a2, a4 +; RV32IM-NEXT: add a4, a4, a2 ; RV32IM-NEXT: srli a5, a4, 31 ; RV32IM-NEXT: srli a4, a4, 9 ; RV32IM-NEXT: add a4, a4, a5 ; RV32IM-NEXT: li a5, 654 ; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: sub a2, a2, a4 ; RV32IM-NEXT: lui a4, 729444 ; RV32IM-NEXT: addi a4, a4, 713 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: add a4, a4, a1 +; RV32IM-NEXT: mulh a4, a3, a4 +; RV32IM-NEXT: add a4, a4, a3 ; RV32IM-NEXT: srli a5, a4, 31 ; RV32IM-NEXT: srli a4, a4, 4 ; RV32IM-NEXT: add a4, a4, a5 ; RV32IM-NEXT: li a5, 23 ; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: sub a3, a3, a4 ; RV32IM-NEXT: lui a4, 395996 ; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulh a4, a2, a4 +; RV32IM-NEXT: mulh a4, a1, a4 ; RV32IM-NEXT: srli a5, a4, 31 ; RV32IM-NEXT: srli a4, a4, 11 ; RV32IM-NEXT: add a4, a4, a5 ; RV32IM-NEXT: lui a5, 1 ; RV32IM-NEXT: addi a5, a5, 1327 ; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: sub a1, a1, a4 ; RV32IM-NEXT: sh zero, 0(a0) -; RV32IM-NEXT: sh a2, 6(a0) -; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_one: @@ -850,24 +850,24 @@ define <4 x i16> @dont_fold_srem_one(<4 x 
i16> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lh s0, 24(a1) -; RV64I-NEXT: lh s1, 16(a1) ; RV64I-NEXT: lh a2, 8(a1) +; RV64I-NEXT: lh s0, 16(a1) +; RV64I-NEXT: lh s1, 24(a1) ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: li a1, 654 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 23 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __moddi3@plt -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: sh a0, 6(s2) -; RV64I-NEXT: sh s1, 4(s2) +; RV64I-NEXT: sh s0, 4(s2) ; RV64I-NEXT: sh s3, 2(s2) ; RV64I-NEXT: sh zero, 0(s2) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -880,42 +880,42 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_srem_one: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 16(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI4_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI4_0)(a3) -; RV64IM-NEXT: lh a4, 24(a1) -; RV64IM-NEXT: lh a1, 8(a1) -; RV64IM-NEXT: mulh a3, a2, a3 -; RV64IM-NEXT: add a3, a3, a2 -; RV64IM-NEXT: srli a5, a3, 63 -; RV64IM-NEXT: srli a3, a3, 4 -; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: lui a2, %hi(.LCPI4_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI4_0)(a2) +; RV64IM-NEXT: lh a3, 16(a1) +; RV64IM-NEXT: lh a4, 8(a1) +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: mulh a2, a3, a2 +; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srli a2, a2, 4 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: lui a5, %hi(.LCPI4_1) ; RV64IM-NEXT: ld a5, %lo(.LCPI4_1)(a5) ; RV64IM-NEXT: li a6, 23 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: mulh a3, a1, a5 -; RV64IM-NEXT: srli a5, a3, 63 -; RV64IM-NEXT: srli a3, a3, 8 -; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: subw a3, a3, a2 +; RV64IM-NEXT: mulh a2, a4, a5 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srli a2, a2, 8 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: lui a5, %hi(.LCPI4_2) ; RV64IM-NEXT: ld a5, %lo(.LCPI4_2)(a5) ; RV64IM-NEXT: li a6, 654 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: mulh a3, a4, a5 -; RV64IM-NEXT: srli a5, a3, 63 -; RV64IM-NEXT: srli a3, a3, 11 -; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: subw a4, a4, a2 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srli a2, a2, 11 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: lui a5, 1 ; RV64IM-NEXT: addi a5, a5, 1327 -; RV64IM-NEXT: mul a3, a3, a5 -; RV64IM-NEXT: subw a4, a4, a3 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: subw a1, a1, a2 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -933,8 +933,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: lh a2, 4(a1) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lh s1, 12(a1) ; RV32I-NEXT: lh a0, 8(a1) +; RV32I-NEXT: lh s1, 12(a1) ; RV32I-NEXT: srli a1, a2, 17 ; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: lui a3, 8 @@ -1005,8 +1005,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x 
i16> %x) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lh a2, 8(a1) ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lh s1, 24(a1) ; RV64I-NEXT: lh a0, 16(a1) +; RV64I-NEXT: lh s1, 24(a1) ; RV64I-NEXT: srli a1, a2, 49 ; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: lui a3, 8 @@ -1033,38 +1033,38 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_urem_i16_smax: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 16(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI5_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI5_0)(a3) -; RV64IM-NEXT: lh a4, 24(a1) -; RV64IM-NEXT: mulh a3, a2, a3 -; RV64IM-NEXT: add a3, a3, a2 -; RV64IM-NEXT: srli a5, a3, 63 -; RV64IM-NEXT: srli a3, a3, 4 -; RV64IM-NEXT: add a3, a3, a5 -; RV64IM-NEXT: li a5, 23 -; RV64IM-NEXT: lui a6, %hi(.LCPI5_1) -; RV64IM-NEXT: ld a6, %lo(.LCPI5_1)(a6) -; RV64IM-NEXT: mul a3, a3, a5 -; RV64IM-NEXT: lh a1, 8(a1) -; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: mulh a3, a4, a6 -; RV64IM-NEXT: srli a5, a3, 63 -; RV64IM-NEXT: srli a3, a3, 11 -; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: lui a2, %hi(.LCPI5_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI5_0)(a2) +; RV64IM-NEXT: lh a3, 16(a1) +; RV64IM-NEXT: lh a4, 8(a1) +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: mulh a2, a3, a2 +; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srli a2, a2, 4 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: lui a5, %hi(.LCPI5_1) +; RV64IM-NEXT: ld a5, %lo(.LCPI5_1)(a5) +; RV64IM-NEXT: li a6, 23 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: subw a3, a3, a2 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srli a2, a2, 11 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: lui a5, 1 ; RV64IM-NEXT: addi a5, a5, 1327 -; RV64IM-NEXT: mul a3, a3, a5 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: srli a3, a1, 49 -; RV64IM-NEXT: add a3, a1, a3 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: srli a2, a4, 49 +; RV64IM-NEXT: add a2, a4, a2 ; RV64IM-NEXT: lui a5, 8 -; RV64IM-NEXT: and a3, a3, a5 -; RV64IM-NEXT: subw a1, a1, a3 +; RV64IM-NEXT: and a2, a2, a5 +; RV64IM-NEXT: subw a4, a4, a2 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1085,49 +1085,50 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 24(a1) -; RV32I-NEXT: lw s1, 28(a1) -; RV32I-NEXT: lw s2, 16(a1) -; RV32I-NEXT: lw s3, 20(a1) -; RV32I-NEXT: lw s4, 8(a1) -; RV32I-NEXT: lw s5, 12(a1) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: lw s0, 0(a1) +; RV32I-NEXT: lw s1, 4(a1) +; RV32I-NEXT: lw s2, 8(a1) +; RV32I-NEXT: lw s3, 12(a1) +; RV32I-NEXT: lw a3, 16(a1) +; RV32I-NEXT: lw a4, 20(a1) +; RV32I-NEXT: lw s4, 24(a1) +; RV32I-NEXT: lw s5, 28(a1) ; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: li a2, 1 +; RV32I-NEXT: li a2, 23 ; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: mv s8, a1 -; RV32I-NEXT: li a2, 654 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a2, a0, 1327 ; RV32I-NEXT: mv a0, s4 ; RV32I-NEXT: mv a1, s5 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: mv 
s5, a1 -; RV32I-NEXT: li a2, 23 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: mv s3, a1 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a2, a0, 1327 +; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: sw a1, 28(s6) -; RV32I-NEXT: sw a0, 24(s6) -; RV32I-NEXT: sw s3, 20(s6) -; RV32I-NEXT: sw s2, 16(s6) -; RV32I-NEXT: sw s5, 12(s6) -; RV32I-NEXT: sw s4, 8(s6) -; RV32I-NEXT: sw s8, 4(s6) -; RV32I-NEXT: sw s7, 0(s6) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: li a2, 654 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: call __moddi3@plt +; RV32I-NEXT: sw s5, 28(s6) +; RV32I-NEXT: sw s4, 24(s6) +; RV32I-NEXT: sw s8, 20(s6) +; RV32I-NEXT: sw s7, 16(s6) +; RV32I-NEXT: sw a1, 12(s6) +; RV32I-NEXT: sw a0, 8(s6) +; RV32I-NEXT: sw s1, 4(s6) +; RV32I-NEXT: sw s0, 0(s6) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -1154,49 +1155,50 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s0, 24(a1) -; RV32IM-NEXT: lw s1, 28(a1) -; RV32IM-NEXT: lw s2, 16(a1) -; RV32IM-NEXT: lw s3, 20(a1) -; RV32IM-NEXT: lw s4, 8(a1) -; RV32IM-NEXT: lw s5, 12(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: lw s0, 0(a1) +; RV32IM-NEXT: lw s1, 4(a1) +; RV32IM-NEXT: lw s2, 8(a1) +; RV32IM-NEXT: lw s3, 12(a1) +; RV32IM-NEXT: lw a3, 16(a1) +; RV32IM-NEXT: lw a4, 20(a1) +; RV32IM-NEXT: lw s4, 24(a1) +; RV32IM-NEXT: lw s5, 28(a1) ; RV32IM-NEXT: mv s6, a0 -; RV32IM-NEXT: li a2, 1 +; RV32IM-NEXT: li a2, 23 ; RV32IM-NEXT: mv a0, a3 +; RV32IM-NEXT: mv a1, a4 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt ; RV32IM-NEXT: mv s7, a0 ; RV32IM-NEXT: mv s8, a1 -; RV32IM-NEXT: li a2, 654 +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a2, a0, 1327 ; RV32IM-NEXT: mv a0, s4 ; RV32IM-NEXT: mv a1, s5 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt ; RV32IM-NEXT: mv s4, a0 ; RV32IM-NEXT: mv s5, a1 -; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: mv s2, a0 -; RV32IM-NEXT: mv s3, a1 -; RV32IM-NEXT: lui a0, 1 -; RV32IM-NEXT: addi a2, a0, 1327 +; RV32IM-NEXT: li a2, 1 ; RV32IM-NEXT: mv a0, s0 ; RV32IM-NEXT: mv a1, s1 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: sw a1, 28(s6) -; RV32IM-NEXT: sw a0, 24(s6) -; RV32IM-NEXT: sw s3, 20(s6) -; RV32IM-NEXT: sw s2, 16(s6) -; RV32IM-NEXT: sw s5, 12(s6) -; RV32IM-NEXT: sw s4, 8(s6) -; RV32IM-NEXT: sw s8, 4(s6) -; RV32IM-NEXT: sw s7, 0(s6) +; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: mv s1, a1 +; RV32IM-NEXT: li a2, 654 +; RV32IM-NEXT: mv a0, s2 +; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: li a3, 0 +; RV32IM-NEXT: call __moddi3@plt +; RV32IM-NEXT: sw s5, 28(s6) +; RV32IM-NEXT: sw s4, 24(s6) +; RV32IM-NEXT: sw s8, 20(s6) +; RV32IM-NEXT: sw s7, 16(s6) +; RV32IM-NEXT: sw a1, 12(s6) +; RV32IM-NEXT: sw a0, 8(s6) +; RV32IM-NEXT: sw s1, 4(s6) +; RV32IM-NEXT: sw s0, 0(s6) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded 
Reload @@ -1218,24 +1220,24 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: ld s0, 24(a1) -; RV64I-NEXT: ld s1, 16(a1) ; RV64I-NEXT: ld a2, 8(a1) +; RV64I-NEXT: ld s0, 16(a1) +; RV64I-NEXT: ld s1, 24(a1) ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: li a1, 654 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 23 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __moddi3@plt -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3@plt ; RV64I-NEXT: sd a0, 24(s2) -; RV64I-NEXT: sd s1, 16(s2) +; RV64I-NEXT: sd s0, 16(s2) ; RV64I-NEXT: sd s3, 8(s2) ; RV64I-NEXT: sd zero, 0(s2) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -1248,42 +1250,42 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_srem_i64: ; RV64IM: # %bb.0: -; RV64IM-NEXT: ld a2, 16(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI6_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI6_0)(a3) -; RV64IM-NEXT: ld a4, 24(a1) -; RV64IM-NEXT: ld a1, 8(a1) -; RV64IM-NEXT: mulh a3, a2, a3 -; RV64IM-NEXT: add a3, a3, a2 -; RV64IM-NEXT: srli a5, a3, 63 -; RV64IM-NEXT: srai a3, a3, 4 -; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: lui a2, %hi(.LCPI6_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI6_0)(a2) +; RV64IM-NEXT: ld a3, 16(a1) +; RV64IM-NEXT: ld a4, 8(a1) +; RV64IM-NEXT: ld a1, 24(a1) +; RV64IM-NEXT: mulh a2, a3, a2 +; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srai a2, a2, 4 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: lui a5, %hi(.LCPI6_1) ; RV64IM-NEXT: ld a5, %lo(.LCPI6_1)(a5) ; RV64IM-NEXT: li a6, 23 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: sub a2, a2, a3 -; RV64IM-NEXT: mulh a3, a1, a5 -; RV64IM-NEXT: srli a5, a3, 63 -; RV64IM-NEXT: srai a3, a3, 8 -; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: sub a3, a3, a2 +; RV64IM-NEXT: mulh a2, a4, a5 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srai a2, a2, 8 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: lui a5, %hi(.LCPI6_2) ; RV64IM-NEXT: ld a5, %lo(.LCPI6_2)(a5) ; RV64IM-NEXT: li a6, 654 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: sub a1, a1, a3 -; RV64IM-NEXT: mulh a3, a4, a5 -; RV64IM-NEXT: srli a5, a3, 63 -; RV64IM-NEXT: srai a3, a3, 11 -; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: sub a4, a4, a2 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srai a2, a2, 11 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: lui a5, 1 ; RV64IM-NEXT: addiw a5, a5, 1327 -; RV64IM-NEXT: mul a3, a3, a5 -; RV64IM-NEXT: sub a4, a4, a3 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a2 ; RV64IM-NEXT: sd zero, 0(a0) -; RV64IM-NEXT: sd a4, 24(a0) -; RV64IM-NEXT: sd a1, 8(a0) -; RV64IM-NEXT: sd a2, 16(a0) +; RV64IM-NEXT: sd a1, 24(a0) +; RV64IM-NEXT: sd a4, 8(a0) +; RV64IM-NEXT: sd a3, 16(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll index 651df94bab496..2b2ba675b29f2 100644 --- a/llvm/test/CodeGen/RISCV/stack-store-check.ll +++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll @@ -143,15 +143,15 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a2, sp, 
392 ; CHECK-NEXT: sw a3, 392(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw a0, 424(sp) +; CHECK-NEXT: lw a0, 432(sp) ; CHECK-NEXT: lw a1, 436(sp) -; CHECK-NEXT: lw a2, 432(sp) +; CHECK-NEXT: lw a2, 424(sp) ; CHECK-NEXT: lw a3, 428(sp) ; CHECK-NEXT: lui a4, %hi(X) ; CHECK-NEXT: sw a1, %lo(X+12)(a4) -; CHECK-NEXT: sw a2, %lo(X+8)(a4) +; CHECK-NEXT: sw a0, %lo(X+8)(a4) ; CHECK-NEXT: sw a3, %lo(X+4)(a4) -; CHECK-NEXT: sw a0, %lo(X)(a4) +; CHECK-NEXT: sw a2, %lo(X)(a4) ; CHECK-NEXT: lw s8, 4(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw s8, 212(sp) ; CHECK-NEXT: lw s4, 8(sp) # 4-byte Folded Reload @@ -190,15 +190,15 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a2, sp, 344 ; CHECK-NEXT: sw s9, 360(sp) ; CHECK-NEXT: call __multf3@plt -; CHECK-NEXT: lw a0, 376(sp) +; CHECK-NEXT: lw a0, 384(sp) ; CHECK-NEXT: lw a1, 388(sp) -; CHECK-NEXT: lw a2, 384(sp) +; CHECK-NEXT: lw a2, 376(sp) ; CHECK-NEXT: lw a3, 380(sp) ; CHECK-NEXT: lui a4, %hi(S) ; CHECK-NEXT: sw a1, %lo(S+12)(a4) -; CHECK-NEXT: sw a2, %lo(S+8)(a4) +; CHECK-NEXT: sw a0, %lo(S+8)(a4) ; CHECK-NEXT: sw a3, %lo(S+4)(a4) -; CHECK-NEXT: sw a0, %lo(S)(a4) +; CHECK-NEXT: sw a2, %lo(S)(a4) ; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 260(sp) ; CHECK-NEXT: sw s10, 256(sp) @@ -216,15 +216,15 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: lw a3, 44(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a3, 264(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw a0, 280(sp) +; CHECK-NEXT: lw a0, 288(sp) ; CHECK-NEXT: lw a1, 292(sp) -; CHECK-NEXT: lw a2, 288(sp) +; CHECK-NEXT: lw a2, 280(sp) ; CHECK-NEXT: lw a3, 284(sp) ; CHECK-NEXT: lui a4, %hi(T) ; CHECK-NEXT: sw a1, %lo(T+12)(a4) -; CHECK-NEXT: sw a2, %lo(T+8)(a4) +; CHECK-NEXT: sw a0, %lo(T+8)(a4) ; CHECK-NEXT: sw a3, %lo(T+4)(a4) -; CHECK-NEXT: sw a0, %lo(T)(a4) +; CHECK-NEXT: sw a2, %lo(T)(a4) ; CHECK-NEXT: sw zero, 164(sp) ; CHECK-NEXT: sw zero, 160(sp) ; CHECK-NEXT: sw zero, 156(sp) @@ -238,15 +238,15 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a2, sp, 152 ; CHECK-NEXT: sw s1, 168(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw a0, 184(sp) +; CHECK-NEXT: lw a0, 192(sp) ; CHECK-NEXT: lw a1, 196(sp) -; CHECK-NEXT: lw a2, 192(sp) +; CHECK-NEXT: lw a2, 184(sp) ; CHECK-NEXT: lw a3, 188(sp) ; CHECK-NEXT: lui a4, %hi(Y) ; CHECK-NEXT: sw a1, %lo(Y+12)(a4) -; CHECK-NEXT: sw a2, %lo(Y+8)(a4) +; CHECK-NEXT: sw a0, %lo(Y+8)(a4) ; CHECK-NEXT: sw a3, %lo(Y+4)(a4) -; CHECK-NEXT: sw a0, %lo(Y)(a4) +; CHECK-NEXT: sw a2, %lo(Y)(a4) ; CHECK-NEXT: sw zero, 116(sp) ; CHECK-NEXT: sw zero, 112(sp) ; CHECK-NEXT: sw zero, 108(sp) diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll index 8c0d97afe6c21..1ce388fc7ac4a 100644 --- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll @@ -10,47 +10,47 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill -; RISCV32-NEXT: lw a3, 12(a1) -; RISCV32-NEXT: lw a7, 12(a2) -; RISCV32-NEXT: lw a6, 8(a1) -; RISCV32-NEXT: lw a4, 0(a2) -; RISCV32-NEXT: lw a5, 0(a1) +; RISCV32-NEXT: lw a3, 0(a2) +; RISCV32-NEXT: lw a7, 4(a2) +; RISCV32-NEXT: lw a4, 8(a2) +; RISCV32-NEXT: lw a5, 12(a2) +; RISCV32-NEXT: lw a2, 0(a1) ; RISCV32-NEXT: lw t2, 4(a1) -; RISCV32-NEXT: lw t0, 8(a2) -; 
RISCV32-NEXT: lw a2, 4(a2) -; RISCV32-NEXT: mulhu a1, a5, a4 -; RISCV32-NEXT: mul t1, t2, a4 -; RISCV32-NEXT: add a1, t1, a1 -; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t3, t2, a4 +; RISCV32-NEXT: lw a6, 8(a1) +; RISCV32-NEXT: lw a1, 12(a1) +; RISCV32-NEXT: mulhu t0, a2, a3 +; RISCV32-NEXT: mul t1, t2, a3 +; RISCV32-NEXT: add t0, t1, t0 +; RISCV32-NEXT: sltu t1, t0, t1 +; RISCV32-NEXT: mulhu t3, t2, a3 ; RISCV32-NEXT: add t4, t3, t1 -; RISCV32-NEXT: mul t1, a5, a2 -; RISCV32-NEXT: add a1, t1, a1 -; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t3, a5, a2 +; RISCV32-NEXT: mul t1, a2, a7 +; RISCV32-NEXT: add t0, t1, t0 +; RISCV32-NEXT: sltu t1, t0, t1 +; RISCV32-NEXT: mulhu t3, a2, a7 ; RISCV32-NEXT: add t1, t3, t1 ; RISCV32-NEXT: add t5, t4, t1 -; RISCV32-NEXT: mul t6, t2, a2 +; RISCV32-NEXT: mul t6, t2, a7 ; RISCV32-NEXT: add s0, t6, t5 -; RISCV32-NEXT: mul t1, t0, a5 -; RISCV32-NEXT: mul s3, a6, a4 +; RISCV32-NEXT: mul t1, a4, a2 +; RISCV32-NEXT: mul s3, a6, a3 ; RISCV32-NEXT: add s4, s3, t1 ; RISCV32-NEXT: add t1, s0, s4 ; RISCV32-NEXT: sltu t3, t1, s0 ; RISCV32-NEXT: sltu s0, s0, t6 ; RISCV32-NEXT: sltu t4, t5, t4 -; RISCV32-NEXT: mulhu t5, t2, a2 +; RISCV32-NEXT: mulhu t5, t2, a7 ; RISCV32-NEXT: add t4, t5, t4 ; RISCV32-NEXT: add s0, t4, s0 -; RISCV32-NEXT: mul t4, t2, t0 -; RISCV32-NEXT: mul t5, a7, a5 +; RISCV32-NEXT: mul t4, t2, a4 +; RISCV32-NEXT: mul t5, a5, a2 ; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: mulhu s1, t0, a5 +; RISCV32-NEXT: mulhu s1, a4, a2 ; RISCV32-NEXT: add s2, s1, t4 -; RISCV32-NEXT: mul t4, a2, a6 -; RISCV32-NEXT: mul t5, a3, a4 +; RISCV32-NEXT: mul t4, a7, a6 +; RISCV32-NEXT: mul t5, a1, a3 ; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: mulhu t5, a6, a4 +; RISCV32-NEXT: mulhu t5, a6, a3 ; RISCV32-NEXT: add t6, t5, t4 ; RISCV32-NEXT: add t4, t6, s2 ; RISCV32-NEXT: sltu s3, s4, s3 @@ -63,41 +63,41 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: .LBB0_2: # %start ; RISCV32-NEXT: sltu s0, s2, s1 ; RISCV32-NEXT: snez s1, t2 -; RISCV32-NEXT: snez s2, a7 +; RISCV32-NEXT: snez s2, a5 ; RISCV32-NEXT: and s1, s2, s1 -; RISCV32-NEXT: mulhu s2, a7, a5 +; RISCV32-NEXT: mulhu s2, a5, a2 ; RISCV32-NEXT: snez s2, s2 ; RISCV32-NEXT: or s1, s1, s2 -; RISCV32-NEXT: mulhu t2, t2, t0 +; RISCV32-NEXT: mulhu t2, t2, a4 ; RISCV32-NEXT: snez t2, t2 ; RISCV32-NEXT: or t2, s1, t2 ; RISCV32-NEXT: or t2, t2, s0 ; RISCV32-NEXT: sltu t5, t6, t5 -; RISCV32-NEXT: snez t6, a2 -; RISCV32-NEXT: snez s0, a3 +; RISCV32-NEXT: snez t6, a7 +; RISCV32-NEXT: snez s0, a1 ; RISCV32-NEXT: and t6, s0, t6 -; RISCV32-NEXT: mulhu s0, a3, a4 +; RISCV32-NEXT: mulhu s0, a1, a3 ; RISCV32-NEXT: snez s0, s0 ; RISCV32-NEXT: or t6, t6, s0 -; RISCV32-NEXT: mulhu a2, a2, a6 -; RISCV32-NEXT: snez a2, a2 -; RISCV32-NEXT: or a2, t6, a2 -; RISCV32-NEXT: or a2, a2, t5 -; RISCV32-NEXT: or a7, t0, a7 +; RISCV32-NEXT: mulhu a7, a7, a6 ; RISCV32-NEXT: snez a7, a7 -; RISCV32-NEXT: or a3, a6, a3 -; RISCV32-NEXT: snez a3, a3 -; RISCV32-NEXT: and a3, a3, a7 -; RISCV32-NEXT: or a2, a3, a2 -; RISCV32-NEXT: or a3, t2, t3 -; RISCV32-NEXT: or a2, a2, a3 -; RISCV32-NEXT: mul a3, a5, a4 -; RISCV32-NEXT: andi a2, a2, 1 -; RISCV32-NEXT: sw a3, 0(a0) -; RISCV32-NEXT: sw a1, 4(a0) +; RISCV32-NEXT: or a7, t6, a7 +; RISCV32-NEXT: or a7, a7, t5 +; RISCV32-NEXT: or a4, a4, a5 +; RISCV32-NEXT: snez a4, a4 +; RISCV32-NEXT: or a1, a6, a1 +; RISCV32-NEXT: snez a1, a1 +; RISCV32-NEXT: and a1, a1, a4 +; RISCV32-NEXT: or a1, a1, a7 +; RISCV32-NEXT: or a4, t2, t3 +; RISCV32-NEXT: or a1, a1, a4 +; 
RISCV32-NEXT: mul a2, a2, a3 +; RISCV32-NEXT: andi a1, a1, 1 +; RISCV32-NEXT: sw a2, 0(a0) +; RISCV32-NEXT: sw t0, 4(a0) ; RISCV32-NEXT: sw t1, 8(a0) ; RISCV32-NEXT: sw t4, 12(a0) -; RISCV32-NEXT: sb a2, 16(a0) +; RISCV32-NEXT: sb a1, 16(a0) ; RISCV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll index 599b0d08629ea..f2a7008728db3 100644 --- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll +++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll @@ -85,50 +85,49 @@ define i32 @load_i32(ptr %p) { define i64 @load_i64(ptr %p) { ; RV32I-LABEL: load_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a1, 1(a0) -; RV32I-NEXT: lbu a2, 0(a0) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a2, 1(a0) ; RV32I-NEXT: lbu a3, 2(a0) ; RV32I-NEXT: lbu a4, 3(a0) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 5(a0) +; RV32I-NEXT: lbu a7, 6(a0) +; RV32I-NEXT: lbu t0, 7(a0) +; RV32I-NEXT: slli a2, a2, 8 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a2, a4, a3 -; RV32I-NEXT: or a2, a2, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a0, 7(a0) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a4 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: or a0, a4, a3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a1, a6, a5 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a2, t0, a7 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: load_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a2, 0(a0) +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: lbu a2, 1(a0) ; RV64I-NEXT: lbu a3, 2(a0) ; RV64I-NEXT: lbu a4, 3(a0) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 5(a0) +; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a2, a2, 8 +; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: slli a3, a3, 16 ; RV64I-NEXT: slli a4, a4, 24 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: lbu a2, 5(a0) -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: lbu a4, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a2, a2, 8 -; RV64I-NEXT: or a2, a2, a3 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a2, a6, a5 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 32aca29d16e9b..228fb308ab44d 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -19,29 +19,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu s0, 12(a1) -; RV32I-NEXT: lhu s1, 8(a1) -; RV32I-NEXT: lhu s2, 4(a1) ; RV32I-NEXT: lhu a2, 0(a1) +; RV32I-NEXT: lhu s0, 4(a1) +; RV32I-NEXT: 
lhu s1, 8(a1) +; RV32I-NEXT: lhu s2, 12(a1) ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, 124 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __umodsi3@plt -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a1, 98 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: li a1, 1003 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: sh a0, 6(s3) ; RV32I-NEXT: sh s1, 4(s3) -; RV32I-NEXT: sh s2, 2(s3) +; RV32I-NEXT: sh s0, 2(s3) ; RV32I-NEXT: sh s4, 0(s3) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -54,38 +54,38 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_urem_vec_1: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 12(a1) -; RV32IM-NEXT: lhu a3, 8(a1) -; RV32IM-NEXT: lhu a4, 0(a1) -; RV32IM-NEXT: lhu a1, 4(a1) +; RV32IM-NEXT: lhu a2, 0(a1) +; RV32IM-NEXT: lhu a3, 4(a1) +; RV32IM-NEXT: lhu a4, 8(a1) +; RV32IM-NEXT: lhu a1, 12(a1) ; RV32IM-NEXT: lui a5, 11038 ; RV32IM-NEXT: addi a5, a5, -1465 -; RV32IM-NEXT: mulhu a5, a4, a5 +; RV32IM-NEXT: mulhu a5, a2, a5 ; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a4, a4, a5 +; RV32IM-NEXT: sub a2, a2, a5 ; RV32IM-NEXT: lui a5, 8456 ; RV32IM-NEXT: addi a5, a5, 1058 -; RV32IM-NEXT: mulhu a5, a1, a5 +; RV32IM-NEXT: mulhu a5, a3, a5 ; RV32IM-NEXT: li a6, 124 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: sub a3, a3, a5 ; RV32IM-NEXT: lui a5, 10700 ; RV32IM-NEXT: addi a5, a5, -1003 -; RV32IM-NEXT: mulhu a5, a3, a5 +; RV32IM-NEXT: mulhu a5, a4, a5 ; RV32IM-NEXT: li a6, 98 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a3, a3, a5 +; RV32IM-NEXT: sub a4, a4, a5 ; RV32IM-NEXT: lui a5, 1045 ; RV32IM-NEXT: addi a5, a5, 1801 -; RV32IM-NEXT: mulhu a5, a2, a5 +; RV32IM-NEXT: mulhu a5, a1, a5 ; RV32IM-NEXT: li a6, 1003 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: sh a2, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_urem_vec_1: @@ -97,29 +97,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu s0, 24(a1) -; RV64I-NEXT: lhu s1, 16(a1) -; RV64I-NEXT: lhu s2, 8(a1) ; RV64I-NEXT: lhu a2, 0(a1) +; RV64I-NEXT: lhu s0, 8(a1) +; RV64I-NEXT: lhu s1, 16(a1) +; RV64I-NEXT: lhu s2, 24(a1) ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, 124 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __umoddi3@plt -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: li a1, 98 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: li a1, 1003 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: sh a0, 6(s3) ; RV64I-NEXT: sh s1, 4(s3) -; RV64I-NEXT: sh s2, 2(s3) +; RV64I-NEXT: sh s0, 2(s3) ; RV64I-NEXT: sh s4, 0(s3) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; 
RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -132,38 +132,38 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_urem_vec_1: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 0(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI0_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI0_0)(a3) -; RV64IM-NEXT: lhu a4, 24(a1) +; RV64IM-NEXT: lui a2, %hi(.LCPI0_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI0_0)(a2) +; RV64IM-NEXT: lhu a3, 0(a1) +; RV64IM-NEXT: lhu a4, 8(a1) ; RV64IM-NEXT: lhu a5, 16(a1) -; RV64IM-NEXT: lhu a1, 8(a1) -; RV64IM-NEXT: mulhu a3, a2, a3 +; RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: mulhu a2, a3, a2 ; RV64IM-NEXT: lui a6, %hi(.LCPI0_1) ; RV64IM-NEXT: ld a6, %lo(.LCPI0_1)(a6) ; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: mulhu a3, a1, a6 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: subw a3, a3, a2 +; RV64IM-NEXT: mulhu a2, a4, a6 ; RV64IM-NEXT: lui a6, %hi(.LCPI0_2) ; RV64IM-NEXT: ld a6, %lo(.LCPI0_2)(a6) ; RV64IM-NEXT: li a7, 124 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: mulhu a3, a5, a6 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: subw a4, a4, a2 +; RV64IM-NEXT: mulhu a2, a5, a6 ; RV64IM-NEXT: lui a6, %hi(.LCPI0_3) ; RV64IM-NEXT: ld a6, %lo(.LCPI0_3)(a6) ; RV64IM-NEXT: li a7, 98 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a5, a5, a3 -; RV64IM-NEXT: mulhu a3, a4, a6 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: subw a5, a5, a2 +; RV64IM-NEXT: mulhu a2, a1, a6 ; RV64IM-NEXT: li a6, 1003 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: sh a5, 4(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -179,29 +179,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu s0, 12(a1) -; RV32I-NEXT: lhu s1, 8(a1) -; RV32I-NEXT: lhu s2, 4(a1) ; RV32I-NEXT: lhu a2, 0(a1) +; RV32I-NEXT: lhu s0, 4(a1) +; RV32I-NEXT: lhu s1, 8(a1) +; RV32I-NEXT: lhu s2, 12(a1) ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __umodsi3@plt -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: sh a0, 6(s3) ; RV32I-NEXT: sh s1, 4(s3) -; RV32I-NEXT: sh s2, 2(s3) +; RV32I-NEXT: sh s0, 2(s3) ; RV32I-NEXT: sh s4, 0(s3) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -214,29 +214,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_urem_vec_2: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 12(a1) -; RV32IM-NEXT: lhu a3, 8(a1) -; RV32IM-NEXT: lhu a4, 0(a1) -; RV32IM-NEXT: lhu a1, 4(a1) +; RV32IM-NEXT: lhu a2, 0(a1) +; RV32IM-NEXT: lhu a3, 4(a1) +; RV32IM-NEXT: lhu a4, 8(a1) +; RV32IM-NEXT: lhu a1, 12(a1) ; RV32IM-NEXT: lui a5, 11038 ; RV32IM-NEXT: addi a5, a5, -1465 -; RV32IM-NEXT: mulhu a6, 
a4, a5 +; RV32IM-NEXT: mulhu a6, a2, a5 ; RV32IM-NEXT: li a7, 95 ; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a4, a4, a6 -; RV32IM-NEXT: mulhu a6, a1, a5 -; RV32IM-NEXT: mul a6, a6, a7 -; RV32IM-NEXT: sub a1, a1, a6 +; RV32IM-NEXT: sub a2, a2, a6 ; RV32IM-NEXT: mulhu a6, a3, a5 ; RV32IM-NEXT: mul a6, a6, a7 ; RV32IM-NEXT: sub a3, a3, a6 -; RV32IM-NEXT: mulhu a5, a2, a5 +; RV32IM-NEXT: mulhu a6, a4, a5 +; RV32IM-NEXT: mul a6, a6, a7 +; RV32IM-NEXT: sub a4, a4, a6 +; RV32IM-NEXT: mulhu a5, a1, a5 ; RV32IM-NEXT: mul a5, a5, a7 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: sh a2, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_urem_vec_2: @@ -248,29 +248,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu s0, 24(a1) -; RV64I-NEXT: lhu s1, 16(a1) -; RV64I-NEXT: lhu s2, 8(a1) ; RV64I-NEXT: lhu a2, 0(a1) +; RV64I-NEXT: lhu s0, 8(a1) +; RV64I-NEXT: lhu s1, 16(a1) +; RV64I-NEXT: lhu s2, 24(a1) ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __umoddi3@plt -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: sh a0, 6(s3) ; RV64I-NEXT: sh s1, 4(s3) -; RV64I-NEXT: sh s2, 2(s3) +; RV64I-NEXT: sh s0, 2(s3) ; RV64I-NEXT: sh s4, 0(s3) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -283,29 +283,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_urem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 0(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI1_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI1_0)(a3) -; RV64IM-NEXT: lhu a4, 24(a1) +; RV64IM-NEXT: lui a2, %hi(.LCPI1_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI1_0)(a2) +; RV64IM-NEXT: lhu a3, 0(a1) +; RV64IM-NEXT: lhu a4, 8(a1) ; RV64IM-NEXT: lhu a5, 16(a1) -; RV64IM-NEXT: lhu a1, 8(a1) -; RV64IM-NEXT: mulhu a6, a2, a3 +; RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: mulhu a6, a3, a2 ; RV64IM-NEXT: li a7, 95 ; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: mulhu a6, a1, a3 +; RV64IM-NEXT: subw a3, a3, a6 +; RV64IM-NEXT: mulhu a6, a4, a2 ; RV64IM-NEXT: mul a6, a6, a7 -; RV64IM-NEXT: subw a1, a1, a6 -; RV64IM-NEXT: mulhu a6, a5, a3 +; RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: mulhu a6, a5, a2 ; RV64IM-NEXT: mul a6, a6, a7 ; RV64IM-NEXT: subw a5, a5, a6 -; RV64IM-NEXT: mulhu a3, a4, a3 -; RV64IM-NEXT: mul a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: mulhu a2, a1, a2 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: sh a5, 4(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -386,33 +386,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> 
%x) nounwind { ; ; RV32IM-LABEL: combine_urem_udiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 0(a1) -; RV32IM-NEXT: lhu a3, 4(a1) -; RV32IM-NEXT: lhu a4, 12(a1) +; RV32IM-NEXT: lhu a2, 12(a1) +; RV32IM-NEXT: lhu a3, 0(a1) +; RV32IM-NEXT: lhu a4, 4(a1) ; RV32IM-NEXT: lhu a1, 8(a1) ; RV32IM-NEXT: lui a5, 11038 ; RV32IM-NEXT: addi a5, a5, -1465 -; RV32IM-NEXT: mulhu a6, a4, a5 +; RV32IM-NEXT: mulhu a6, a2, a5 ; RV32IM-NEXT: li a7, 95 ; RV32IM-NEXT: mul t0, a6, a7 ; RV32IM-NEXT: mulhu t1, a1, a5 ; RV32IM-NEXT: mul t2, t1, a7 -; RV32IM-NEXT: mulhu t3, a3, a5 +; RV32IM-NEXT: mulhu t3, a4, a5 ; RV32IM-NEXT: mul t4, t3, a7 -; RV32IM-NEXT: mulhu a5, a2, a5 +; RV32IM-NEXT: mulhu a5, a3, a5 ; RV32IM-NEXT: mul a7, a5, a7 -; RV32IM-NEXT: add a2, a2, a5 -; RV32IM-NEXT: sub a2, a2, a7 -; RV32IM-NEXT: add a3, a3, t3 -; RV32IM-NEXT: sub a3, a3, t4 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: sub a3, a3, a7 +; RV32IM-NEXT: add a4, a4, t3 +; RV32IM-NEXT: sub a4, a4, t4 ; RV32IM-NEXT: add a1, a1, t1 ; RV32IM-NEXT: sub a1, a1, t2 -; RV32IM-NEXT: add a4, a4, a6 -; RV32IM-NEXT: sub a4, a4, t0 -; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: add a2, a2, a6 +; RV32IM-NEXT: sub a2, a2, t0 +; RV32IM-NEXT: sh a2, 6(a0) ; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a3, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a4, 2(a0) +; RV32IM-NEXT: sh a3, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_urem_udiv: @@ -531,19 +531,19 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu s1, 8(a1) +; RV32I-NEXT: lhu s1, 0(a1) ; RV32I-NEXT: lhu s2, 4(a1) -; RV32I-NEXT: lhu s3, 0(a1) +; RV32I-NEXT: lhu s3, 8(a1) ; RV32I-NEXT: lhu a2, 12(a1) ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __umodsi3@plt -; RV32I-NEXT: andi a1, s3, 63 +; RV32I-NEXT: andi a1, s1, 63 ; RV32I-NEXT: andi a2, s2, 31 -; RV32I-NEXT: andi s1, s1, 7 +; RV32I-NEXT: andi a3, s3, 7 ; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh a3, 4(s0) ; RV32I-NEXT: sh a2, 2(s0) ; RV32I-NEXT: sh a1, 0(s0) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -556,23 +556,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: dont_fold_urem_power_of_two: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 8(a1) -; RV32IM-NEXT: lhu a3, 4(a1) -; RV32IM-NEXT: lhu a4, 12(a1) -; RV32IM-NEXT: lhu a1, 0(a1) +; RV32IM-NEXT: lhu a2, 12(a1) +; RV32IM-NEXT: lhu a3, 0(a1) +; RV32IM-NEXT: lhu a4, 4(a1) +; RV32IM-NEXT: lhu a1, 8(a1) ; RV32IM-NEXT: lui a5, 11038 ; RV32IM-NEXT: addi a5, a5, -1465 -; RV32IM-NEXT: mulhu a5, a4, a5 +; RV32IM-NEXT: mulhu a5, a2, a5 ; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: andi a1, a1, 63 -; RV32IM-NEXT: andi a3, a3, 31 -; RV32IM-NEXT: andi a2, a2, 7 -; RV32IM-NEXT: sh a2, 4(a0) -; RV32IM-NEXT: sh a3, 2(a0) -; RV32IM-NEXT: sh a1, 0(a0) -; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sub a2, a2, a5 +; RV32IM-NEXT: andi a3, a3, 63 +; RV32IM-NEXT: andi a4, a4, 31 +; RV32IM-NEXT: andi a1, a1, 7 +; RV32IM-NEXT: sh a1, 4(a0) +; RV32IM-NEXT: sh a4, 2(a0) +; RV32IM-NEXT: sh a3, 0(a0) +; RV32IM-NEXT: sh a2, 6(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_power_of_two: @@ -583,19 +583,19 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; 
RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu s1, 16(a1) +; RV64I-NEXT: lhu s1, 0(a1) ; RV64I-NEXT: lhu s2, 8(a1) -; RV64I-NEXT: lhu s3, 0(a1) +; RV64I-NEXT: lhu s3, 16(a1) ; RV64I-NEXT: lhu a2, 24(a1) ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3@plt -; RV64I-NEXT: andi a1, s3, 63 +; RV64I-NEXT: andi a1, s1, 63 ; RV64I-NEXT: andi a2, s2, 31 -; RV64I-NEXT: andi s1, s1, 7 +; RV64I-NEXT: andi a3, s3, 7 ; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh a3, 4(s0) ; RV64I-NEXT: sh a2, 2(s0) ; RV64I-NEXT: sh a1, 0(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -611,19 +611,19 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; RV64IM-NEXT: lhu a2, 24(a1) ; RV64IM-NEXT: lui a3, %hi(.LCPI3_0) ; RV64IM-NEXT: ld a3, %lo(.LCPI3_0)(a3) -; RV64IM-NEXT: lhu a4, 16(a1) +; RV64IM-NEXT: lhu a4, 0(a1) ; RV64IM-NEXT: lhu a5, 8(a1) -; RV64IM-NEXT: lhu a1, 0(a1) +; RV64IM-NEXT: lhu a1, 16(a1) ; RV64IM-NEXT: mulhu a3, a2, a3 ; RV64IM-NEXT: li a6, 95 ; RV64IM-NEXT: mul a3, a3, a6 ; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: andi a1, a1, 63 +; RV64IM-NEXT: andi a3, a4, 63 ; RV64IM-NEXT: andi a5, a5, 31 -; RV64IM-NEXT: andi a4, a4, 7 -; RV64IM-NEXT: sh a4, 4(a0) +; RV64IM-NEXT: andi a1, a1, 7 +; RV64IM-NEXT: sh a1, 4(a0) ; RV64IM-NEXT: sh a5, 2(a0) -; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a2, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, @@ -640,24 +640,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu s0, 12(a1) -; RV32I-NEXT: lhu s1, 8(a1) ; RV32I-NEXT: lhu a2, 4(a1) +; RV32I-NEXT: lhu s0, 8(a1) +; RV32I-NEXT: lhu s1, 12(a1) ; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: li a1, 654 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: li a1, 23 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __umodsi3@plt -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a1, a0, 1327 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __umodsi3@plt ; RV32I-NEXT: sh a0, 6(s2) -; RV32I-NEXT: sh s1, 4(s2) +; RV32I-NEXT: sh s0, 4(s2) ; RV32I-NEXT: sh s3, 2(s2) ; RV32I-NEXT: sh zero, 0(s2) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -670,32 +670,32 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: dont_fold_urem_one: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 12(a1) -; RV32IM-NEXT: lhu a3, 4(a1) -; RV32IM-NEXT: lhu a1, 8(a1) +; RV32IM-NEXT: lhu a2, 4(a1) +; RV32IM-NEXT: lhu a3, 8(a1) +; RV32IM-NEXT: lhu a1, 12(a1) ; RV32IM-NEXT: lui a4, 1603 ; RV32IM-NEXT: addi a4, a4, 1341 -; RV32IM-NEXT: mulhu a4, a3, a4 +; RV32IM-NEXT: mulhu a4, a2, a4 ; RV32IM-NEXT: li a5, 654 ; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: sub a2, a2, a4 ; RV32IM-NEXT: lui a4, 45590 ; RV32IM-NEXT: addi a4, a4, 1069 -; RV32IM-NEXT: mulhu a4, a1, a4 +; RV32IM-NEXT: mulhu a4, a3, a4 ; RV32IM-NEXT: li a5, 23 ; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: sub a3, a3, a4 ; RV32IM-NEXT: lui a4, 193 ; RV32IM-NEXT: addi a4, a4, 1464 -; RV32IM-NEXT: mulhu a4, a2, a4 +; RV32IM-NEXT: mulhu a4, a1, a4 ; RV32IM-NEXT: lui a5, 1 ; RV32IM-NEXT: addi a5, a5, 1327 ; 
RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: sub a1, a1, a4 ; RV32IM-NEXT: sh zero, 0(a0) -; RV32IM-NEXT: sh a2, 6(a0) -; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_one: @@ -706,24 +706,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu s0, 24(a1) -; RV64I-NEXT: lhu s1, 16(a1) ; RV64I-NEXT: lhu a2, 8(a1) +; RV64I-NEXT: lhu s0, 16(a1) +; RV64I-NEXT: lhu s1, 24(a1) ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: li a1, 654 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 23 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __umoddi3@plt -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: sh a0, 6(s2) -; RV64I-NEXT: sh s1, 4(s2) +; RV64I-NEXT: sh s0, 4(s2) ; RV64I-NEXT: sh s3, 2(s2) ; RV64I-NEXT: sh zero, 0(s2) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -736,32 +736,32 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_urem_one: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 8(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI4_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI4_0)(a3) -; RV64IM-NEXT: lhu a4, 24(a1) -; RV64IM-NEXT: lhu a1, 16(a1) -; RV64IM-NEXT: mulhu a3, a2, a3 +; RV64IM-NEXT: lui a2, %hi(.LCPI4_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI4_0)(a2) +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a4, 16(a1) +; RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: mulhu a2, a3, a2 ; RV64IM-NEXT: lui a5, %hi(.LCPI4_1) ; RV64IM-NEXT: ld a5, %lo(.LCPI4_1)(a5) ; RV64IM-NEXT: li a6, 654 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: mulhu a3, a1, a5 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: subw a3, a3, a2 +; RV64IM-NEXT: mulhu a2, a4, a5 ; RV64IM-NEXT: lui a5, %hi(.LCPI4_2) ; RV64IM-NEXT: ld a5, %lo(.LCPI4_2)(a5) ; RV64IM-NEXT: li a6, 23 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: mulhu a3, a4, a5 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: subw a4, a4, a2 +; RV64IM-NEXT: mulhu a2, a1, a5 ; RV64IM-NEXT: lui a5, 1 ; RV64IM-NEXT: addi a5, a5, 1327 -; RV64IM-NEXT: mul a3, a3, a5 -; RV64IM-NEXT: subw a4, a4, a3 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: subw a1, a1, a2 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a4, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -791,49 +791,50 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 24(a1) -; RV32I-NEXT: lw s1, 28(a1) -; RV32I-NEXT: lw s2, 16(a1) -; RV32I-NEXT: lw s3, 20(a1) -; RV32I-NEXT: lw s4, 8(a1) -; RV32I-NEXT: lw s5, 12(a1) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: lw s0, 0(a1) +; RV32I-NEXT: lw s1, 4(a1) +; RV32I-NEXT: lw s2, 8(a1) +; RV32I-NEXT: lw s3, 12(a1) +; RV32I-NEXT: lw a3, 16(a1) +; RV32I-NEXT: 
lw a4, 20(a1) +; RV32I-NEXT: lw s4, 24(a1) +; RV32I-NEXT: lw s5, 28(a1) ; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: li a2, 1 +; RV32I-NEXT: li a2, 23 ; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: mv s8, a1 -; RV32I-NEXT: li a2, 654 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a2, a0, 1327 ; RV32I-NEXT: mv a0, s4 ; RV32I-NEXT: mv a1, s5 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: mv s5, a1 -; RV32I-NEXT: li a2, 23 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: mv s3, a1 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a2, a0, 1327 +; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: sw a1, 28(s6) -; RV32I-NEXT: sw a0, 24(s6) -; RV32I-NEXT: sw s3, 20(s6) -; RV32I-NEXT: sw s2, 16(s6) -; RV32I-NEXT: sw s5, 12(s6) -; RV32I-NEXT: sw s4, 8(s6) -; RV32I-NEXT: sw s8, 4(s6) -; RV32I-NEXT: sw s7, 0(s6) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: li a2, 654 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: call __umoddi3@plt +; RV32I-NEXT: sw s5, 28(s6) +; RV32I-NEXT: sw s4, 24(s6) +; RV32I-NEXT: sw s8, 20(s6) +; RV32I-NEXT: sw s7, 16(s6) +; RV32I-NEXT: sw a1, 12(s6) +; RV32I-NEXT: sw a0, 8(s6) +; RV32I-NEXT: sw s1, 4(s6) +; RV32I-NEXT: sw s0, 0(s6) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -860,49 +861,50 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s0, 24(a1) -; RV32IM-NEXT: lw s1, 28(a1) -; RV32IM-NEXT: lw s2, 16(a1) -; RV32IM-NEXT: lw s3, 20(a1) -; RV32IM-NEXT: lw s4, 8(a1) -; RV32IM-NEXT: lw s5, 12(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: lw s0, 0(a1) +; RV32IM-NEXT: lw s1, 4(a1) +; RV32IM-NEXT: lw s2, 8(a1) +; RV32IM-NEXT: lw s3, 12(a1) +; RV32IM-NEXT: lw a3, 16(a1) +; RV32IM-NEXT: lw a4, 20(a1) +; RV32IM-NEXT: lw s4, 24(a1) +; RV32IM-NEXT: lw s5, 28(a1) ; RV32IM-NEXT: mv s6, a0 -; RV32IM-NEXT: li a2, 1 +; RV32IM-NEXT: li a2, 23 ; RV32IM-NEXT: mv a0, a3 +; RV32IM-NEXT: mv a1, a4 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt ; RV32IM-NEXT: mv s7, a0 ; RV32IM-NEXT: mv s8, a1 -; RV32IM-NEXT: li a2, 654 +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a2, a0, 1327 ; RV32IM-NEXT: mv a0, s4 ; RV32IM-NEXT: mv a1, s5 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt ; RV32IM-NEXT: mv s4, a0 ; RV32IM-NEXT: mv s5, a1 -; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: mv s2, a0 -; RV32IM-NEXT: mv s3, a1 -; RV32IM-NEXT: lui a0, 1 -; RV32IM-NEXT: addi a2, a0, 1327 +; RV32IM-NEXT: li a2, 1 ; RV32IM-NEXT: mv a0, s0 ; RV32IM-NEXT: mv a1, s1 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: sw a1, 28(s6) -; RV32IM-NEXT: sw a0, 24(s6) -; RV32IM-NEXT: sw s3, 20(s6) -; RV32IM-NEXT: sw s2, 16(s6) -; RV32IM-NEXT: sw s5, 12(s6) -; RV32IM-NEXT: sw s4, 8(s6) -; RV32IM-NEXT: sw s8, 4(s6) -; RV32IM-NEXT: sw s7, 0(s6) +; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: mv 
s1, a1 +; RV32IM-NEXT: li a2, 654 +; RV32IM-NEXT: mv a0, s2 +; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: li a3, 0 +; RV32IM-NEXT: call __umoddi3@plt +; RV32IM-NEXT: sw s5, 28(s6) +; RV32IM-NEXT: sw s4, 24(s6) +; RV32IM-NEXT: sw s8, 20(s6) +; RV32IM-NEXT: sw s7, 16(s6) +; RV32IM-NEXT: sw a1, 12(s6) +; RV32IM-NEXT: sw a0, 8(s6) +; RV32IM-NEXT: sw s1, 4(s6) +; RV32IM-NEXT: sw s0, 0(s6) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -924,24 +926,24 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: ld s0, 24(a1) -; RV64I-NEXT: ld s1, 16(a1) ; RV64I-NEXT: ld a2, 8(a1) +; RV64I-NEXT: ld s0, 16(a1) +; RV64I-NEXT: ld s1, 24(a1) ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: li a1, 654 ; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 23 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __umoddi3@plt -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __umoddi3@plt ; RV64I-NEXT: sd a0, 24(s2) -; RV64I-NEXT: sd s1, 16(s2) +; RV64I-NEXT: sd s0, 16(s2) ; RV64I-NEXT: sd s3, 8(s2) ; RV64I-NEXT: sd zero, 0(s2) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -954,39 +956,39 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; ; RV64IM-LABEL: dont_fold_urem_i64: ; RV64IM: # %bb.0: -; RV64IM-NEXT: ld a2, 16(a1) -; RV64IM-NEXT: lui a3, %hi(.LCPI6_0) -; RV64IM-NEXT: ld a3, %lo(.LCPI6_0)(a3) -; RV64IM-NEXT: ld a4, 24(a1) -; RV64IM-NEXT: ld a1, 8(a1) -; RV64IM-NEXT: mulhu a3, a2, a3 -; RV64IM-NEXT: sub a5, a2, a3 +; RV64IM-NEXT: lui a2, %hi(.LCPI6_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI6_0)(a2) +; RV64IM-NEXT: ld a3, 16(a1) +; RV64IM-NEXT: ld a4, 8(a1) +; RV64IM-NEXT: ld a1, 24(a1) +; RV64IM-NEXT: mulhu a2, a3, a2 +; RV64IM-NEXT: sub a5, a3, a2 ; RV64IM-NEXT: srli a5, a5, 1 -; RV64IM-NEXT: add a3, a5, a3 -; RV64IM-NEXT: srli a3, a3, 4 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: srli a2, a2, 4 ; RV64IM-NEXT: li a5, 23 ; RV64IM-NEXT: lui a6, %hi(.LCPI6_1) ; RV64IM-NEXT: ld a6, %lo(.LCPI6_1)(a6) -; RV64IM-NEXT: mul a3, a3, a5 -; RV64IM-NEXT: sub a2, a2, a3 -; RV64IM-NEXT: srli a3, a1, 1 -; RV64IM-NEXT: mulhu a3, a3, a6 -; RV64IM-NEXT: srli a3, a3, 7 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a3, a3, a2 +; RV64IM-NEXT: srli a2, a4, 1 +; RV64IM-NEXT: mulhu a2, a2, a6 +; RV64IM-NEXT: srli a2, a2, 7 ; RV64IM-NEXT: lui a5, %hi(.LCPI6_2) ; RV64IM-NEXT: ld a5, %lo(.LCPI6_2)(a5) ; RV64IM-NEXT: li a6, 654 -; RV64IM-NEXT: mul a3, a3, a6 -; RV64IM-NEXT: sub a1, a1, a3 -; RV64IM-NEXT: mulhu a3, a4, a5 -; RV64IM-NEXT: srli a3, a3, 12 +; RV64IM-NEXT: mul a2, a2, a6 +; RV64IM-NEXT: sub a4, a4, a2 +; RV64IM-NEXT: mulhu a2, a1, a5 +; RV64IM-NEXT: srli a2, a2, 12 ; RV64IM-NEXT: lui a5, 1 ; RV64IM-NEXT: addiw a5, a5, 1327 -; RV64IM-NEXT: mul a3, a3, a5 -; RV64IM-NEXT: sub a4, a4, a3 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a2 ; RV64IM-NEXT: sd zero, 0(a0) -; RV64IM-NEXT: sd a4, 24(a0) -; RV64IM-NEXT: sd a1, 8(a0) -; RV64IM-NEXT: sd a2, 16(a0) +; RV64IM-NEXT: sd a1, 24(a0) +; RV64IM-NEXT: sd a4, 8(a0) +; RV64IM-NEXT: sd a3, 16(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i64> %x, ret <4 x i64> %1 diff --git 
a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll index 67d1bfac4d614..a847bdacc9cb0 100644 --- a/llvm/test/CodeGen/RISCV/vararg.ll +++ b/llvm/test/CodeGen/RISCV/vararg.ll @@ -498,11 +498,11 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 20 ; ILP32-ILP32F-FPELIM-NEXT: sw a0, 12(sp) ; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 27 -; ILP32-ILP32F-FPELIM-NEXT: andi a0, a0, -8 -; ILP32-ILP32F-FPELIM-NEXT: addi a1, sp, 35 -; ILP32-ILP32F-FPELIM-NEXT: sw a1, 12(sp) -; ILP32-ILP32F-FPELIM-NEXT: lw a1, 4(a0) -; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a0) +; ILP32-ILP32F-FPELIM-NEXT: andi a1, a0, -8 +; ILP32-ILP32F-FPELIM-NEXT: addi a0, sp, 35 +; ILP32-ILP32F-FPELIM-NEXT: sw a0, 12(sp) +; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a1) +; ILP32-ILP32F-FPELIM-NEXT: lw a1, 4(a1) ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 48 ; ILP32-ILP32F-FPELIM-NEXT: ret ; @@ -522,11 +522,11 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 4 ; ILP32-ILP32F-WITHFP-NEXT: sw a0, -12(s0) ; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 11 -; ILP32-ILP32F-WITHFP-NEXT: andi a0, a0, -8 -; ILP32-ILP32F-WITHFP-NEXT: addi a1, s0, 19 -; ILP32-ILP32F-WITHFP-NEXT: sw a1, -12(s0) -; ILP32-ILP32F-WITHFP-NEXT: lw a1, 4(a0) -; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a0) +; ILP32-ILP32F-WITHFP-NEXT: andi a1, a0, -8 +; ILP32-ILP32F-WITHFP-NEXT: addi a0, s0, 19 +; ILP32-ILP32F-WITHFP-NEXT: sw a0, -12(s0) +; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a1) +; ILP32-ILP32F-WITHFP-NEXT: lw a1, 4(a1) ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; ILP32-ILP32F-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; ILP32-ILP32F-WITHFP-NEXT: addi sp, sp, 48 @@ -545,11 +545,11 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, sp, 20 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a0, 12(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, sp, 27 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a0, a0, -8 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a1, sp, 35 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a1, 12(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a1, 4(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a0) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a1, a0, -8 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a0, sp, 35 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a0, 12(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a1) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a1, 4(a1) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 48 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index b0d435368e92b..c712421f16acb 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -8,8 +8,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: lb a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) @@ -38,17 +38,17 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: 
lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: lbu a3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) @@ -72,8 +72,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: lb a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) @@ -102,17 +102,17 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: lbu a3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) @@ -136,8 +136,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: lb a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) @@ -166,17 +166,17 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: lbu a3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) @@ -198,51 +198,51 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, 
a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 5(a1) -; RV64I-NEXT: lbu a4, 4(a1) -; RV64I-NEXT: lbu a5, 6(a1) -; RV64I-NEXT: lbu a6, 7(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 1(a1) -; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a5, 1(a1) ; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: lbu a7, 3(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: lbu a3, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli a3, a3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a3, a3, 35 -; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a3, a7, a6 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a3, a3, 3 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: srli a1, a0, 48 @@ -272,17 +272,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 0(a1) -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a5, a1, 3 ; RV32I-NEXT: addi a4, a5, -32 ; RV32I-NEXT: srl a1, a3, a5 @@ -334,51 +334,51 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; 
RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 5(a1) -; RV64I-NEXT: lbu a4, 4(a1) -; RV64I-NEXT: lbu a5, 6(a1) -; RV64I-NEXT: lbu a6, 7(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 1(a1) -; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a5, 1(a1) ; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: lbu a7, 3(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: lbu a3, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli a3, a3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a3, a3, 35 -; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a3, a7, a6 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a3, a3, 3 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: srli a1, a0, 48 @@ -408,17 +408,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 0(a1) -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a5, a1, 3 ; RV32I-NEXT: addi a4, a5, -32 ; RV32I-NEXT: sll a1, a3, a5 @@ -470,51 +470,51 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; 
RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 5(a1) -; RV64I-NEXT: lbu a4, 4(a1) -; RV64I-NEXT: lbu a5, 6(a1) -; RV64I-NEXT: lbu a6, 7(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 1(a1) -; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a5, 1(a1) ; RV64I-NEXT: lbu a6, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: lbu a7, 3(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: lbu a3, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli a3, a3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a3, a3, 35 -; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a3, a7, a6 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a3, a3, 3 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: sra a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: srli a1, a0, 48 @@ -544,18 +544,18 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a4, a6, 24 ; RV32I-NEXT: or a5, a4, a5 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: lbu a5, 1(a1) ; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: lbu a5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a5, a1, 3 +; RV32I-NEXT: or a5, a1, a6 +; RV32I-NEXT: slli a5, a5, 3 ; RV32I-NEXT: addi a6, a5, -32 ; RV32I-NEXT: sra a1, a3, a5 ; RV32I-NEXT: bltz a6, .LBB5_2 @@ -607,51 +607,51 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 13(a0) -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli 
a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, t2, t1 ; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a1) -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 6(a1) -; RV64I-NEXT: lbu a7, 7(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 1(a1) -; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) ; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: lbu t0, 3(a1) +; RV64I-NEXT: lbu t1, 4(a1) +; RV64I-NEXT: lbu t2, 5(a1) +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a4, a4, 35 -; RV64I-NEXT: or a5, a4, a1 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 3 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: or a5, a1, a4 ; RV64I-NEXT: addi a4, a5, -64 ; RV64I-NEXT: srl a1, a3, a5 ; RV64I-NEXT: bltz a4, .LBB6_2 @@ -659,25 +659,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB6_3 ; RV64I-NEXT: .LBB6_2: -; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 0(a0) +; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) ; RV64I-NEXT: lbu t0, 2(a0) ; RV64I-NEXT: lbu t1, 3(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: lbu t2, 4(a0) +; RV64I-NEXT: lbu t3, 5(a0) +; RV64I-NEXT: lbu t4, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a0) -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli t4, t4, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t4 ; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 @@ -779,38 +779,38 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: andi a1, a1, 15 ; RV32I-NEXT: addi a0, sp, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; 
RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) +; RV32I-NEXT: lbu t6, 12(a0) +; RV32I-NEXT: lbu s0, 13(a0) +; RV32I-NEXT: lbu s1, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb a0, 15(a2) +; RV32I-NEXT: sb t6, 12(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: sb a7, 5(a2) ; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload @@ -826,51 +826,51 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, t2, t1 ; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a1) -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 6(a1) -; RV64I-NEXT: lbu a7, 7(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 1(a1) -; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) ; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: lbu t0, 3(a1) +; RV64I-NEXT: lbu t1, 4(a1) +; RV64I-NEXT: lbu t2, 5(a1) +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: 
slli a4, a4, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a4, a4, 35 -; RV64I-NEXT: or a5, a4, a1 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 3 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: or a5, a1, a4 ; RV64I-NEXT: addi a4, a5, -64 ; RV64I-NEXT: sll a1, a3, a5 ; RV64I-NEXT: bltz a4, .LBB7_2 @@ -878,25 +878,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB7_3 ; RV64I-NEXT: .LBB7_2: -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 8(a0) +; RV64I-NEXT: lbu a6, 8(a0) +; RV64I-NEXT: lbu a7, 9(a0) ; RV64I-NEXT: lbu t0, 10(a0) ; RV64I-NEXT: lbu t1, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: lbu t2, 12(a0) +; RV64I-NEXT: lbu t3, 13(a0) +; RV64I-NEXT: lbu t4, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: lbu t0, 12(a0) -; RV64I-NEXT: lbu t1, 14(a0) -; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli t4, t4, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t4 ; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 @@ -998,38 +998,38 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: andi a1, a1, 15 ; RV32I-NEXT: addi a0, sp, 20 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) +; RV32I-NEXT: lbu t6, 12(a0) +; RV32I-NEXT: lbu s0, 13(a0) +; RV32I-NEXT: lbu s1, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb a0, 15(a2) +; RV32I-NEXT: 
sb t6, 12(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: sb a7, 5(a2) ; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload @@ -1045,51 +1045,51 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 13(a0) -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: slli a5, a4, 32 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: lbu a5, 5(a1) -; RV64I-NEXT: lbu a6, 4(a1) -; RV64I-NEXT: lbu a7, 6(a1) -; RV64I-NEXT: lbu t0, 7(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, t2, t1 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) ; RV64I-NEXT: lbu t0, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: lbu t2, 4(a1) +; RV64I-NEXT: lbu t3, 5(a1) +; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: lbu a5, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or t2, t3, t2 +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: or a1, a1, a6 -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a5, a5, 35 -; RV64I-NEXT: or a5, a5, a1 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a6, t1, t0 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 3 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: or a5, a1, a5 ; RV64I-NEXT: addi a6, a5, -64 ; RV64I-NEXT: sra a1, a3, a5 ; RV64I-NEXT: bltz a6, .LBB8_2 @@ -1099,25 +1099,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: mv a1, a3 ; RV64I-NEXT: j .LBB8_3 ; RV64I-NEXT: .LBB8_2: -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a6, 1(a0) ; RV64I-NEXT: lbu a7, 2(a0) ; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: 
or a4, a4, a6 +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 5(a0) +; RV64I-NEXT: lbu t3, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a4, a6, a4 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli t0, t0, 24 ; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: or a4, a6, a4 -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t0, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, t3 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a4 @@ -1167,94 +1167,94 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 15(a0) -; RV32I-NEXT: slli a4, a3, 24 -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 2(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu t2, 5(a0) -; RV32I-NEXT: lbu t3, 6(a0) -; RV32I-NEXT: lbu t4, 7(a0) -; RV32I-NEXT: lbu t5, 8(a0) -; RV32I-NEXT: lbu t6, 9(a0) -; RV32I-NEXT: lbu s0, 10(a0) -; RV32I-NEXT: lbu s1, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 14(a0) -; RV32I-NEXT: lbu a0, 13(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; RV32I-NEXT: lbu t0, 4(a0) +; RV32I-NEXT: lbu t1, 5(a0) +; RV32I-NEXT: lbu t2, 6(a0) +; RV32I-NEXT: lbu t3, 7(a0) +; RV32I-NEXT: lbu t4, 8(a0) +; RV32I-NEXT: lbu t5, 9(a0) +; RV32I-NEXT: lbu t6, 10(a0) +; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: lbu s1, 12(a0) +; RV32I-NEXT: lbu s2, 13(a0) +; RV32I-NEXT: lbu a0, 14(a0) +; RV32I-NEXT: slli s3, a3, 24 ; RV32I-NEXT: lbu a1, 0(a1) ; RV32I-NEXT: sb a3, 15(sp) -; RV32I-NEXT: sb s3, 14(sp) -; RV32I-NEXT: sb a0, 13(sp) -; RV32I-NEXT: sb s2, 12(sp) -; RV32I-NEXT: sb s1, 11(sp) -; RV32I-NEXT: sb s0, 10(sp) -; RV32I-NEXT: sb t6, 9(sp) -; RV32I-NEXT: sb t5, 8(sp) -; RV32I-NEXT: sb t4, 7(sp) -; RV32I-NEXT: sb t3, 6(sp) -; RV32I-NEXT: sb t2, 5(sp) -; RV32I-NEXT: sb t1, 4(sp) -; RV32I-NEXT: sb t0, 3(sp) -; RV32I-NEXT: sb a7, 2(sp) -; RV32I-NEXT: sb a6, 1(sp) -; RV32I-NEXT: sb a5, 0(sp) -; RV32I-NEXT: srai a4, a4, 31 -; RV32I-NEXT: sb a4, 28(sp) -; RV32I-NEXT: sb a4, 24(sp) -; RV32I-NEXT: sb a4, 20(sp) -; RV32I-NEXT: sb a4, 16(sp) -; RV32I-NEXT: srli a0, a4, 24 -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: srli a3, a4, 16 -; RV32I-NEXT: sb a3, 30(sp) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 29(sp) -; RV32I-NEXT: sb a0, 27(sp) -; RV32I-NEXT: sb a3, 26(sp) -; RV32I-NEXT: sb a4, 25(sp) -; RV32I-NEXT: sb a0, 23(sp) -; RV32I-NEXT: sb a3, 22(sp) -; RV32I-NEXT: sb a4, 21(sp) -; RV32I-NEXT: sb a0, 19(sp) -; RV32I-NEXT: sb a3, 18(sp) -; RV32I-NEXT: sb a4, 17(sp) +; RV32I-NEXT: sb a0, 14(sp) +; RV32I-NEXT: sb s2, 13(sp) +; RV32I-NEXT: sb s1, 12(sp) +; RV32I-NEXT: sb s0, 11(sp) +; RV32I-NEXT: sb t6, 10(sp) +; RV32I-NEXT: sb t5, 9(sp) +; RV32I-NEXT: sb t4, 8(sp) +; RV32I-NEXT: sb t3, 7(sp) +; RV32I-NEXT: sb t2, 6(sp) +; RV32I-NEXT: sb t1, 5(sp) +; RV32I-NEXT: sb t0, 4(sp) +; RV32I-NEXT: sb a7, 3(sp) +; RV32I-NEXT: sb a6, 2(sp) +; RV32I-NEXT: sb a5, 1(sp) +; RV32I-NEXT: sb a4, 0(sp) +; RV32I-NEXT: srai a0, s3, 31 +; RV32I-NEXT: sb a0, 28(sp) +; RV32I-NEXT: sb a0, 24(sp) +; RV32I-NEXT: sb a0, 
20(sp) +; RV32I-NEXT: sb a0, 16(sp) +; RV32I-NEXT: srli a3, a0, 24 +; RV32I-NEXT: sb a3, 31(sp) +; RV32I-NEXT: srli a4, a0, 16 +; RV32I-NEXT: sb a4, 30(sp) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 29(sp) +; RV32I-NEXT: sb a3, 27(sp) +; RV32I-NEXT: sb a4, 26(sp) +; RV32I-NEXT: sb a0, 25(sp) +; RV32I-NEXT: sb a3, 23(sp) +; RV32I-NEXT: sb a4, 22(sp) +; RV32I-NEXT: sb a0, 21(sp) +; RV32I-NEXT: sb a3, 19(sp) +; RV32I-NEXT: sb a4, 18(sp) +; RV32I-NEXT: sb a0, 17(sp) ; RV32I-NEXT: andi a1, a1, 15 ; RV32I-NEXT: mv a0, sp ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) +; RV32I-NEXT: lbu t6, 12(a0) +; RV32I-NEXT: lbu s0, 13(a0) +; RV32I-NEXT: lbu s1, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb a0, 15(a2) +; RV32I-NEXT: sb t6, 12(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: sb a7, 5(a2) ; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload @@ -1286,18 +1286,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv a5, a1 +; RV64I-NEXT: lbu a7, 30(a0) +; RV64I-NEXT: lbu a6, 31(a0) +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill +; 
RV64I-NEXT: lbu a1, 2(a0) +; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 3(a0) +; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 4(a0) +; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 5(a0) +; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu t2, 7(a0) ; RV64I-NEXT: lbu t3, 8(a0) @@ -1318,19 +1321,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: lbu ra, 24(a0) ; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu a6, 27(a0) -; RV64I-NEXT: lbu a5, 28(a0) -; RV64I-NEXT: lbu a3, 31(a0) -; RV64I-NEXT: lbu a4, 30(a0) +; RV64I-NEXT: lbu a4, 26(a0) +; RV64I-NEXT: lbu a3, 27(a0) +; RV64I-NEXT: lbu a1, 28(a0) ; RV64I-NEXT: lbu a0, 29(a0) -; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: sb a3, 87(sp) -; RV64I-NEXT: sb a4, 86(sp) +; RV64I-NEXT: lbu a5, 0(a5) +; RV64I-NEXT: sb a6, 87(sp) +; RV64I-NEXT: sb a7, 86(sp) ; RV64I-NEXT: sb a0, 85(sp) -; RV64I-NEXT: sb a5, 84(sp) -; RV64I-NEXT: sb a6, 83(sp) -; RV64I-NEXT: sb a7, 82(sp) +; RV64I-NEXT: sb a1, 84(sp) +; RV64I-NEXT: sb a3, 83(sp) +; RV64I-NEXT: sb a4, 82(sp) ; RV64I-NEXT: sb zero, 119(sp) ; RV64I-NEXT: sb zero, 118(sp) ; RV64I-NEXT: sb zero, 117(sp) @@ -1395,80 +1396,80 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: andi a1, a1, 31 +; RV64I-NEXT: andi a5, a5, 31 ; RV64I-NEXT: addi a0, sp, 56 -; RV64I-NEXT: add a6, a0, a1 -; RV64I-NEXT: lbu a0, 8(a6) -; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) +; RV64I-NEXT: add t1, a0, a5 +; RV64I-NEXT: lbu a0, 0(t1) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a6) -; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) +; RV64I-NEXT: lbu a0, 1(t1) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) +; RV64I-NEXT: lbu a6, 2(t1) +; RV64I-NEXT: lbu t0, 3(t1) +; RV64I-NEXT: lbu t2, 4(t1) +; RV64I-NEXT: lbu t3, 5(t1) +; RV64I-NEXT: lbu t4, 6(t1) +; RV64I-NEXT: lbu t5, 7(t1) +; RV64I-NEXT: lbu a0, 8(t1) +; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a0, 9(t1) +; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a0, 10(t1) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) -; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; 
RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 12(a2) +; RV64I-NEXT: lbu a7, 11(t1) +; RV64I-NEXT: lbu t6, 12(t1) +; RV64I-NEXT: lbu s0, 13(t1) +; RV64I-NEXT: lbu s1, 14(t1) +; RV64I-NEXT: lbu s2, 15(t1) +; RV64I-NEXT: lbu s3, 16(t1) +; RV64I-NEXT: lbu s4, 17(t1) +; RV64I-NEXT: lbu s5, 18(t1) +; RV64I-NEXT: lbu s6, 19(t1) +; RV64I-NEXT: lbu s7, 20(t1) +; RV64I-NEXT: lbu s8, 21(t1) +; RV64I-NEXT: lbu s9, 22(t1) +; RV64I-NEXT: lbu s10, 23(t1) +; RV64I-NEXT: lbu s11, 24(t1) +; RV64I-NEXT: lbu ra, 25(t1) +; RV64I-NEXT: lbu a5, 26(t1) +; RV64I-NEXT: lbu a4, 27(t1) +; RV64I-NEXT: lbu a3, 28(t1) +; RV64I-NEXT: lbu a1, 29(t1) +; RV64I-NEXT: lbu a0, 30(t1) +; RV64I-NEXT: lbu t1, 31(t1) +; RV64I-NEXT: sb s10, 23(a2) +; RV64I-NEXT: sb s9, 22(a2) +; RV64I-NEXT: sb s8, 21(a2) +; RV64I-NEXT: sb s7, 20(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: sb s5, 18(a2) +; RV64I-NEXT: sb s4, 17(a2) +; RV64I-NEXT: sb s3, 16(a2) +; RV64I-NEXT: sb t1, 31(a2) +; RV64I-NEXT: sb a0, 30(a2) +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: sb a3, 28(a2) +; RV64I-NEXT: sb a4, 27(a2) +; RV64I-NEXT: sb a5, 26(a2) +; RV64I-NEXT: sb ra, 25(a2) +; RV64I-NEXT: sb s11, 24(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: sb t4, 6(a2) +; RV64I-NEXT: sb t3, 5(a2) +; RV64I-NEXT: sb t2, 4(a2) +; RV64I-NEXT: sb t0, 3(a2) +; RV64I-NEXT: sb a6, 2(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb s2, 15(a2) +; RV64I-NEXT: sb s1, 14(a2) +; RV64I-NEXT: sb s0, 13(a2) +; RV64I-NEXT: sb t6, 12(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 9(a2) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 8(a2) @@ -1504,18 +1505,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv a5, a1 +; RV32I-NEXT: lbu a7, 30(a0) +; RV32I-NEXT: lbu a6, 31(a0) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 2(a0) +; 
RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 3(a0) +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 5(a0) +; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: lbu t3, 8(a0) @@ -1536,19 +1540,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: lbu ra, 24(a0) ; RV32I-NEXT: lbu t0, 25(a0) -; RV32I-NEXT: lbu a7, 26(a0) -; RV32I-NEXT: lbu a6, 27(a0) -; RV32I-NEXT: lbu a5, 28(a0) -; RV32I-NEXT: lbu a3, 31(a0) -; RV32I-NEXT: lbu a4, 30(a0) +; RV32I-NEXT: lbu a4, 26(a0) +; RV32I-NEXT: lbu a3, 27(a0) +; RV32I-NEXT: lbu a1, 28(a0) ; RV32I-NEXT: lbu a0, 29(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb a3, 59(sp) -; RV32I-NEXT: sb a4, 58(sp) +; RV32I-NEXT: lbu a5, 0(a5) +; RV32I-NEXT: sb a6, 59(sp) +; RV32I-NEXT: sb a7, 58(sp) ; RV32I-NEXT: sb a0, 57(sp) -; RV32I-NEXT: sb a5, 56(sp) -; RV32I-NEXT: sb a6, 55(sp) -; RV32I-NEXT: sb a7, 54(sp) +; RV32I-NEXT: sb a1, 56(sp) +; RV32I-NEXT: sb a3, 55(sp) +; RV32I-NEXT: sb a4, 54(sp) ; RV32I-NEXT: sb zero, 91(sp) ; RV32I-NEXT: sb zero, 90(sp) ; RV32I-NEXT: sb zero, 89(sp) @@ -1613,82 +1615,82 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 29(sp) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: andi a1, a1, 31 +; RV32I-NEXT: andi a5, a5, 31 ; RV32I-NEXT: addi a0, sp, 28 -; RV32I-NEXT: add a6, a0, a1 -; RV32I-NEXT: lbu a0, 6(a6) +; RV32I-NEXT: add t1, a0, a5 +; RV32I-NEXT: lbu a0, 0(t1) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) -; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) -; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) +; RV32I-NEXT: lbu a0, 1(t1) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) +; RV32I-NEXT: lbu a7, 2(t1) +; RV32I-NEXT: lbu t0, 3(t1) +; RV32I-NEXT: lbu a0, 4(t1) +; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 5(t1) +; RV32I-NEXT: lbu a0, 6(t1) +; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a0, 7(t1) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a5, 28(a2) -; RV32I-NEXT: sb ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) -; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; 
RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: lbu t2, 8(t1) +; RV32I-NEXT: lbu t3, 9(t1) +; RV32I-NEXT: lbu t4, 10(t1) +; RV32I-NEXT: lbu t5, 11(t1) +; RV32I-NEXT: lbu t6, 12(t1) +; RV32I-NEXT: lbu s0, 13(t1) +; RV32I-NEXT: lbu s1, 14(t1) +; RV32I-NEXT: lbu s2, 15(t1) +; RV32I-NEXT: lbu s3, 16(t1) +; RV32I-NEXT: lbu s4, 17(t1) +; RV32I-NEXT: lbu s5, 18(t1) +; RV32I-NEXT: lbu s6, 19(t1) +; RV32I-NEXT: lbu s7, 20(t1) +; RV32I-NEXT: lbu s8, 21(t1) +; RV32I-NEXT: lbu s9, 22(t1) +; RV32I-NEXT: lbu s10, 23(t1) +; RV32I-NEXT: lbu s11, 24(t1) +; RV32I-NEXT: lbu ra, 25(t1) +; RV32I-NEXT: lbu a4, 26(t1) +; RV32I-NEXT: lbu a0, 27(t1) +; RV32I-NEXT: lbu a3, 28(t1) +; RV32I-NEXT: lbu a1, 29(t1) +; RV32I-NEXT: lbu a5, 30(t1) +; RV32I-NEXT: lbu t1, 31(t1) +; RV32I-NEXT: sb ra, 25(a2) +; RV32I-NEXT: sb s11, 24(a2) +; RV32I-NEXT: sb a0, 27(a2) +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: sb a1, 29(a2) +; RV32I-NEXT: sb a3, 28(a2) +; RV32I-NEXT: sb t1, 31(a2) +; RV32I-NEXT: sb a5, 30(a2) +; RV32I-NEXT: sb s4, 17(a2) +; RV32I-NEXT: sb s3, 16(a2) +; RV32I-NEXT: sb s6, 19(a2) +; RV32I-NEXT: sb s5, 18(a2) +; RV32I-NEXT: sb s8, 21(a2) +; RV32I-NEXT: sb s7, 20(a2) +; RV32I-NEXT: sb s10, 23(a2) +; RV32I-NEXT: sb s9, 22(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb t6, 12(a2) +; RV32I-NEXT: sb s2, 15(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb a7, 2(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb a0, 1(a2) +; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a6, 5(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 6(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload @@ -1729,18 +1731,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv a5, a1 +; RV64I-NEXT: lbu a7, 30(a0) +; RV64I-NEXT: lbu a6, 31(a0) +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: 
lbu a1, 1(a0) +; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 2(a0) +; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 3(a0) +; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 4(a0) +; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 5(a0) +; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu t2, 7(a0) ; RV64I-NEXT: lbu t3, 8(a0) @@ -1761,19 +1766,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: lbu ra, 24(a0) ; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu a6, 27(a0) -; RV64I-NEXT: lbu a5, 28(a0) -; RV64I-NEXT: lbu a3, 31(a0) -; RV64I-NEXT: lbu a4, 30(a0) +; RV64I-NEXT: lbu a4, 26(a0) +; RV64I-NEXT: lbu a3, 27(a0) +; RV64I-NEXT: lbu a1, 28(a0) ; RV64I-NEXT: lbu a0, 29(a0) -; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: sb a3, 119(sp) -; RV64I-NEXT: sb a4, 118(sp) +; RV64I-NEXT: lbu a5, 0(a5) +; RV64I-NEXT: sb a6, 119(sp) +; RV64I-NEXT: sb a7, 118(sp) ; RV64I-NEXT: sb a0, 117(sp) -; RV64I-NEXT: sb a5, 116(sp) -; RV64I-NEXT: sb a6, 115(sp) -; RV64I-NEXT: sb a7, 114(sp) +; RV64I-NEXT: sb a1, 116(sp) +; RV64I-NEXT: sb a3, 115(sp) +; RV64I-NEXT: sb a4, 114(sp) ; RV64I-NEXT: sb zero, 87(sp) ; RV64I-NEXT: sb zero, 86(sp) ; RV64I-NEXT: sb zero, 85(sp) @@ -1838,80 +1841,80 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 89(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 88(sp) -; RV64I-NEXT: andi a1, a1, 31 +; RV64I-NEXT: andi a5, a5, 31 ; RV64I-NEXT: addi a0, sp, 88 -; RV64I-NEXT: sub a6, a0, a1 -; RV64I-NEXT: lbu a0, 8(a6) -; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) +; RV64I-NEXT: sub t1, a0, a5 +; RV64I-NEXT: lbu a0, 0(t1) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a6) -; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) +; RV64I-NEXT: lbu a0, 1(t1) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) +; RV64I-NEXT: lbu a6, 2(t1) +; RV64I-NEXT: lbu t0, 3(t1) +; RV64I-NEXT: lbu t2, 4(t1) +; RV64I-NEXT: lbu t3, 5(t1) +; RV64I-NEXT: lbu t4, 6(t1) +; RV64I-NEXT: lbu t5, 7(t1) +; RV64I-NEXT: lbu a0, 8(t1) +; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a0, 9(t1) +; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a0, 10(t1) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) -; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb 
a5, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 12(a2) +; RV64I-NEXT: lbu a7, 11(t1) +; RV64I-NEXT: lbu t6, 12(t1) +; RV64I-NEXT: lbu s0, 13(t1) +; RV64I-NEXT: lbu s1, 14(t1) +; RV64I-NEXT: lbu s2, 15(t1) +; RV64I-NEXT: lbu s3, 16(t1) +; RV64I-NEXT: lbu s4, 17(t1) +; RV64I-NEXT: lbu s5, 18(t1) +; RV64I-NEXT: lbu s6, 19(t1) +; RV64I-NEXT: lbu s7, 20(t1) +; RV64I-NEXT: lbu s8, 21(t1) +; RV64I-NEXT: lbu s9, 22(t1) +; RV64I-NEXT: lbu s10, 23(t1) +; RV64I-NEXT: lbu s11, 24(t1) +; RV64I-NEXT: lbu ra, 25(t1) +; RV64I-NEXT: lbu a5, 26(t1) +; RV64I-NEXT: lbu a4, 27(t1) +; RV64I-NEXT: lbu a3, 28(t1) +; RV64I-NEXT: lbu a1, 29(t1) +; RV64I-NEXT: lbu a0, 30(t1) +; RV64I-NEXT: lbu t1, 31(t1) +; RV64I-NEXT: sb s10, 23(a2) +; RV64I-NEXT: sb s9, 22(a2) +; RV64I-NEXT: sb s8, 21(a2) +; RV64I-NEXT: sb s7, 20(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: sb s5, 18(a2) +; RV64I-NEXT: sb s4, 17(a2) +; RV64I-NEXT: sb s3, 16(a2) +; RV64I-NEXT: sb t1, 31(a2) +; RV64I-NEXT: sb a0, 30(a2) +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: sb a3, 28(a2) +; RV64I-NEXT: sb a4, 27(a2) +; RV64I-NEXT: sb a5, 26(a2) +; RV64I-NEXT: sb ra, 25(a2) +; RV64I-NEXT: sb s11, 24(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: sb t4, 6(a2) +; RV64I-NEXT: sb t3, 5(a2) +; RV64I-NEXT: sb t2, 4(a2) +; RV64I-NEXT: sb t0, 3(a2) +; RV64I-NEXT: sb a6, 2(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb s2, 15(a2) +; RV64I-NEXT: sb s1, 14(a2) +; RV64I-NEXT: sb s0, 13(a2) +; RV64I-NEXT: sb t6, 12(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 9(a2) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 8(a2) @@ -1947,18 +1950,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv a5, a1 +; RV32I-NEXT: lbu a7, 30(a0) +; RV32I-NEXT: lbu a6, 31(a0) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: 
sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 2(a0) +; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 3(a0) +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 5(a0) +; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: lbu t3, 8(a0) @@ -1979,19 +1985,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: lbu ra, 24(a0) ; RV32I-NEXT: lbu t0, 25(a0) -; RV32I-NEXT: lbu a7, 26(a0) -; RV32I-NEXT: lbu a6, 27(a0) -; RV32I-NEXT: lbu a5, 28(a0) -; RV32I-NEXT: lbu a3, 31(a0) -; RV32I-NEXT: lbu a4, 30(a0) +; RV32I-NEXT: lbu a4, 26(a0) +; RV32I-NEXT: lbu a3, 27(a0) +; RV32I-NEXT: lbu a1, 28(a0) ; RV32I-NEXT: lbu a0, 29(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb a3, 91(sp) -; RV32I-NEXT: sb a4, 90(sp) +; RV32I-NEXT: lbu a5, 0(a5) +; RV32I-NEXT: sb a6, 91(sp) +; RV32I-NEXT: sb a7, 90(sp) ; RV32I-NEXT: sb a0, 89(sp) -; RV32I-NEXT: sb a5, 88(sp) -; RV32I-NEXT: sb a6, 87(sp) -; RV32I-NEXT: sb a7, 86(sp) +; RV32I-NEXT: sb a1, 88(sp) +; RV32I-NEXT: sb a3, 87(sp) +; RV32I-NEXT: sb a4, 86(sp) ; RV32I-NEXT: sb zero, 59(sp) ; RV32I-NEXT: sb zero, 58(sp) ; RV32I-NEXT: sb zero, 57(sp) @@ -2056,82 +2060,82 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 61(sp) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 60(sp) -; RV32I-NEXT: andi a1, a1, 31 +; RV32I-NEXT: andi a5, a5, 31 ; RV32I-NEXT: addi a0, sp, 60 -; RV32I-NEXT: sub a6, a0, a1 -; RV32I-NEXT: lbu a0, 6(a6) +; RV32I-NEXT: sub t1, a0, a5 +; RV32I-NEXT: lbu a0, 0(t1) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) -; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) -; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) +; RV32I-NEXT: lbu a0, 1(t1) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) +; RV32I-NEXT: lbu a7, 2(t1) +; RV32I-NEXT: lbu t0, 3(t1) +; RV32I-NEXT: lbu a0, 4(t1) +; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 5(t1) +; RV32I-NEXT: lbu a0, 6(t1) +; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a0, 7(t1) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a5, 28(a2) -; RV32I-NEXT: sb ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) 
-; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: lbu t2, 8(t1) +; RV32I-NEXT: lbu t3, 9(t1) +; RV32I-NEXT: lbu t4, 10(t1) +; RV32I-NEXT: lbu t5, 11(t1) +; RV32I-NEXT: lbu t6, 12(t1) +; RV32I-NEXT: lbu s0, 13(t1) +; RV32I-NEXT: lbu s1, 14(t1) +; RV32I-NEXT: lbu s2, 15(t1) +; RV32I-NEXT: lbu s3, 16(t1) +; RV32I-NEXT: lbu s4, 17(t1) +; RV32I-NEXT: lbu s5, 18(t1) +; RV32I-NEXT: lbu s6, 19(t1) +; RV32I-NEXT: lbu s7, 20(t1) +; RV32I-NEXT: lbu s8, 21(t1) +; RV32I-NEXT: lbu s9, 22(t1) +; RV32I-NEXT: lbu s10, 23(t1) +; RV32I-NEXT: lbu s11, 24(t1) +; RV32I-NEXT: lbu ra, 25(t1) +; RV32I-NEXT: lbu a4, 26(t1) +; RV32I-NEXT: lbu a0, 27(t1) +; RV32I-NEXT: lbu a3, 28(t1) +; RV32I-NEXT: lbu a1, 29(t1) +; RV32I-NEXT: lbu a5, 30(t1) +; RV32I-NEXT: lbu t1, 31(t1) +; RV32I-NEXT: sb ra, 25(a2) +; RV32I-NEXT: sb s11, 24(a2) +; RV32I-NEXT: sb a0, 27(a2) +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: sb a1, 29(a2) +; RV32I-NEXT: sb a3, 28(a2) +; RV32I-NEXT: sb t1, 31(a2) +; RV32I-NEXT: sb a5, 30(a2) +; RV32I-NEXT: sb s4, 17(a2) +; RV32I-NEXT: sb s3, 16(a2) +; RV32I-NEXT: sb s6, 19(a2) +; RV32I-NEXT: sb s5, 18(a2) +; RV32I-NEXT: sb s8, 21(a2) +; RV32I-NEXT: sb s7, 20(a2) +; RV32I-NEXT: sb s10, 23(a2) +; RV32I-NEXT: sb s9, 22(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb t6, 12(a2) +; RV32I-NEXT: sb s2, 15(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb a7, 2(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb a0, 1(a2) +; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a6, 5(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 6(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload @@ -2172,20 +2176,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv t0, a1 -; RV64I-NEXT: lbu t1, 31(a0) +; RV64I-NEXT: mv t1, a1 +; RV64I-NEXT: lbu t0, 30(a0) ; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a1, 2(a0) +; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a1, 3(a0) +; RV64I-NEXT: lbu a1, 2(a0) ; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a1, 4(a0) +; RV64I-NEXT: lbu a1, 3(a0) ; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill -; 
RV64I-NEXT: lbu a1, 5(a0) +; RV64I-NEXT: lbu a1, 4(a0) ; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 5(a0) +; RV64I-NEXT: sd a1, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu t2, 6(a0) ; RV64I-NEXT: lbu t3, 7(a0) ; RV64I-NEXT: lbu t4, 8(a0) @@ -2208,18 +2212,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a6, 25(a0) ; RV64I-NEXT: lbu a5, 26(a0) ; RV64I-NEXT: lbu a4, 27(a0) -; RV64I-NEXT: lbu a1, 30(a0) -; RV64I-NEXT: lbu a3, 29(a0) -; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: lbu t0, 0(t0) -; RV64I-NEXT: sb a1, 86(sp) -; RV64I-NEXT: sb a3, 85(sp) -; RV64I-NEXT: sb a0, 84(sp) +; RV64I-NEXT: lbu a3, 28(a0) +; RV64I-NEXT: lbu a1, 29(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: lbu t1, 0(t1) +; RV64I-NEXT: sd t1, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sb t0, 86(sp) +; RV64I-NEXT: sb a1, 85(sp) +; RV64I-NEXT: sb a3, 84(sp) ; RV64I-NEXT: sb a4, 83(sp) ; RV64I-NEXT: sb a5, 82(sp) ; RV64I-NEXT: sb a6, 81(sp) -; RV64I-NEXT: sb t1, 87(sp) -; RV64I-NEXT: slli t1, t1, 56 +; RV64I-NEXT: sb a0, 87(sp) +; RV64I-NEXT: slli a0, a0, 56 ; RV64I-NEXT: sb a7, 80(sp) ; RV64I-NEXT: sb ra, 79(sp) ; RV64I-NEXT: sb s11, 78(sp) @@ -2239,19 +2244,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb t4, 64(sp) ; RV64I-NEXT: sb t3, 63(sp) ; RV64I-NEXT: sb t2, 62(sp) -; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 61(sp) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 60(sp) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 59(sp) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 58(sp) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 57(sp) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t1, 63 +; RV64I-NEXT: ld a1, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 61(sp) +; RV64I-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 60(sp) +; RV64I-NEXT: ld a1, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 59(sp) +; RV64I-NEXT: ld a1, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 58(sp) +; RV64I-NEXT: ld a1, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 57(sp) +; RV64I-NEXT: ld a1, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 56(sp) +; RV64I-NEXT: srai a0, a0, 63 ; RV64I-NEXT: sb a0, 112(sp) ; RV64I-NEXT: sb a0, 104(sp) ; RV64I-NEXT: sb a0, 96(sp) @@ -2291,80 +2296,81 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a6, 91(sp) ; RV64I-NEXT: sb a7, 90(sp) ; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: andi a0, t0, 31 +; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: andi a0, a0, 31 ; RV64I-NEXT: addi a1, sp, 56 -; RV64I-NEXT: add a6, a1, a0 -; RV64I-NEXT: lbu a0, 8(a6) -; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) +; RV64I-NEXT: add t1, a1, a0 +; RV64I-NEXT: lbu a0, 0(t1) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a6) -; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) +; RV64I-NEXT: lbu a0, 1(t1) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) +; RV64I-NEXT: lbu a6, 2(t1) +; RV64I-NEXT: lbu t0, 3(t1) +; RV64I-NEXT: lbu t2, 4(t1) +; RV64I-NEXT: lbu t3, 5(t1) +; RV64I-NEXT: lbu t4, 6(t1) +; RV64I-NEXT: lbu t5, 7(t1) +; RV64I-NEXT: lbu a0, 8(t1) +; RV64I-NEXT: sd a0, 48(sp) # 8-byte 
Folded Spill +; RV64I-NEXT: lbu a0, 9(t1) +; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a0, 10(t1) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) -; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 12(a2) +; RV64I-NEXT: lbu a7, 11(t1) +; RV64I-NEXT: lbu t6, 12(t1) +; RV64I-NEXT: lbu s0, 13(t1) +; RV64I-NEXT: lbu s1, 14(t1) +; RV64I-NEXT: lbu s2, 15(t1) +; RV64I-NEXT: lbu s3, 16(t1) +; RV64I-NEXT: lbu s4, 17(t1) +; RV64I-NEXT: lbu s5, 18(t1) +; RV64I-NEXT: lbu s6, 19(t1) +; RV64I-NEXT: lbu s7, 20(t1) +; RV64I-NEXT: lbu s8, 21(t1) +; RV64I-NEXT: lbu s9, 22(t1) +; RV64I-NEXT: lbu s10, 23(t1) +; RV64I-NEXT: lbu s11, 24(t1) +; RV64I-NEXT: lbu ra, 25(t1) +; RV64I-NEXT: lbu a5, 26(t1) +; RV64I-NEXT: lbu a4, 27(t1) +; RV64I-NEXT: lbu a3, 28(t1) +; RV64I-NEXT: lbu a1, 29(t1) +; RV64I-NEXT: lbu a0, 30(t1) +; RV64I-NEXT: lbu t1, 31(t1) +; RV64I-NEXT: sb s10, 23(a2) +; RV64I-NEXT: sb s9, 22(a2) +; RV64I-NEXT: sb s8, 21(a2) +; RV64I-NEXT: sb s7, 20(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: sb s5, 18(a2) +; RV64I-NEXT: sb s4, 17(a2) +; RV64I-NEXT: sb s3, 16(a2) +; RV64I-NEXT: sb t1, 31(a2) +; RV64I-NEXT: sb a0, 30(a2) +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: sb a3, 28(a2) +; RV64I-NEXT: sb a4, 27(a2) +; RV64I-NEXT: sb a5, 26(a2) +; RV64I-NEXT: sb ra, 25(a2) +; RV64I-NEXT: sb s11, 24(a2) +; RV64I-NEXT: sb t5, 7(a2) +; RV64I-NEXT: sb t4, 6(a2) +; RV64I-NEXT: sb t3, 5(a2) +; RV64I-NEXT: sb t2, 4(a2) +; RV64I-NEXT: sb t0, 3(a2) +; RV64I-NEXT: sb a6, 2(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: sb s2, 15(a2) +; RV64I-NEXT: sb s1, 14(a2) +; RV64I-NEXT: sb s0, 13(a2) +; RV64I-NEXT: sb t6, 12(a2) +; RV64I-NEXT: sb a7, 11(a2) +; RV64I-NEXT: ld a0, 16(sp) # 8-byte 
Folded Reload +; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 9(a2) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 8(a2) @@ -2400,20 +2406,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv t0, a1 -; RV32I-NEXT: lbu t1, 31(a0) +; RV32I-NEXT: mv t1, a1 +; RV32I-NEXT: lbu t0, 30(a0) ; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 1(a0) ; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 2(a0) +; RV32I-NEXT: lbu a1, 1(a0) ; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 3(a0) +; RV32I-NEXT: lbu a1, 2(a0) ; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: lbu a1, 3(a0) ; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 5(a0) +; RV32I-NEXT: lbu a1, 4(a0) ; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 5(a0) +; RV32I-NEXT: sw a1, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu t2, 6(a0) ; RV32I-NEXT: lbu t3, 7(a0) ; RV32I-NEXT: lbu t4, 8(a0) @@ -2436,18 +2442,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a6, 25(a0) ; RV32I-NEXT: lbu a5, 26(a0) ; RV32I-NEXT: lbu a4, 27(a0) -; RV32I-NEXT: lbu a1, 30(a0) -; RV32I-NEXT: lbu a3, 29(a0) -; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: lbu t0, 0(t0) -; RV32I-NEXT: sb a1, 58(sp) -; RV32I-NEXT: sb a3, 57(sp) -; RV32I-NEXT: sb a0, 56(sp) +; RV32I-NEXT: lbu a3, 28(a0) +; RV32I-NEXT: lbu a1, 29(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: lbu t1, 0(t1) +; RV32I-NEXT: sw t1, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sb t0, 58(sp) +; RV32I-NEXT: sb a1, 57(sp) +; RV32I-NEXT: sb a3, 56(sp) ; RV32I-NEXT: sb a4, 55(sp) ; RV32I-NEXT: sb a5, 54(sp) ; RV32I-NEXT: sb a6, 53(sp) -; RV32I-NEXT: sb t1, 59(sp) -; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: sb a0, 59(sp) +; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: sb a7, 52(sp) ; RV32I-NEXT: sb ra, 51(sp) ; RV32I-NEXT: sb s11, 50(sp) @@ -2467,19 +2474,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb t4, 36(sp) ; RV32I-NEXT: sb t3, 35(sp) ; RV32I-NEXT: sb t2, 34(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 33(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 32(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 30(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 29(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: srai a0, t1, 31 +; RV32I-NEXT: lw a1, 0(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a1, 33(sp) +; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a1, 32(sp) +; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a1, 31(sp) +; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a1, 30(sp) +; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a1, 29(sp) +; RV32I-NEXT: lw a1, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a1, 28(sp) +; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: sb a0, 88(sp) ; RV32I-NEXT: sb a0, 84(sp) ; RV32I-NEXT: sb a0, 
80(sp) @@ -2515,82 +2522,83 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a1, 63(sp) ; RV32I-NEXT: sb a3, 62(sp) ; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: andi a0, t0, 31 +; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: andi a0, a0, 31 ; RV32I-NEXT: addi a1, sp, 28 -; RV32I-NEXT: add a6, a1, a0 -; RV32I-NEXT: lbu a0, 6(a6) +; RV32I-NEXT: add t1, a1, a0 +; RV32I-NEXT: lbu a0, 0(t1) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) -; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) -; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) +; RV32I-NEXT: lbu a0, 1(t1) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) +; RV32I-NEXT: lbu a7, 2(t1) +; RV32I-NEXT: lbu t0, 3(t1) +; RV32I-NEXT: lbu a0, 4(t1) +; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 5(t1) +; RV32I-NEXT: lbu a0, 6(t1) +; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a0, 7(t1) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a5, 28(a2) -; RV32I-NEXT: sb ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) -; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: lbu t2, 8(t1) +; RV32I-NEXT: lbu t3, 9(t1) +; RV32I-NEXT: lbu t4, 10(t1) +; RV32I-NEXT: lbu t5, 11(t1) +; RV32I-NEXT: lbu t6, 12(t1) +; RV32I-NEXT: lbu s0, 13(t1) +; RV32I-NEXT: lbu s1, 14(t1) +; RV32I-NEXT: lbu s2, 15(t1) +; RV32I-NEXT: lbu s3, 16(t1) +; RV32I-NEXT: lbu s4, 17(t1) +; RV32I-NEXT: lbu s5, 18(t1) +; RV32I-NEXT: lbu s6, 19(t1) +; RV32I-NEXT: lbu s7, 20(t1) +; RV32I-NEXT: lbu s8, 21(t1) +; RV32I-NEXT: lbu s9, 22(t1) +; RV32I-NEXT: lbu s10, 23(t1) +; RV32I-NEXT: lbu s11, 24(t1) +; RV32I-NEXT: lbu ra, 25(t1) +; RV32I-NEXT: lbu a4, 26(t1) +; RV32I-NEXT: lbu a0, 27(t1) +; RV32I-NEXT: lbu a3, 28(t1) +; RV32I-NEXT: lbu a1, 29(t1) +; RV32I-NEXT: lbu a5, 30(t1) +; RV32I-NEXT: lbu t1, 31(t1) +; RV32I-NEXT: sb ra, 25(a2) +; 
RV32I-NEXT: sb s11, 24(a2) +; RV32I-NEXT: sb a0, 27(a2) +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: sb a1, 29(a2) +; RV32I-NEXT: sb a3, 28(a2) +; RV32I-NEXT: sb t1, 31(a2) +; RV32I-NEXT: sb a5, 30(a2) +; RV32I-NEXT: sb s4, 17(a2) +; RV32I-NEXT: sb s3, 16(a2) +; RV32I-NEXT: sb s6, 19(a2) +; RV32I-NEXT: sb s5, 18(a2) +; RV32I-NEXT: sb s8, 21(a2) +; RV32I-NEXT: sb s7, 20(a2) +; RV32I-NEXT: sb s10, 23(a2) +; RV32I-NEXT: sb s9, 22(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb t6, 12(a2) +; RV32I-NEXT: sb s2, 15(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: sb a7, 2(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb a0, 1(a2) +; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a6, 5(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 6(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index a601256bc2afa..381e7b75080eb 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -8,8 +8,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: lb a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) @@ -37,17 +37,17 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: lbu a3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: srli a1, a0, 16 @@ -69,8 +69,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: lb a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) @@ -98,17 +98,17 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: lbu 
a3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: srli a1, a0, 16 @@ -130,8 +130,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: lb a0, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) @@ -159,17 +159,17 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: lbu a3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: srli a1, a0, 16 @@ -189,47 +189,47 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 1(a1) -; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a3, 0(a1) +; RV64I-NEXT: lbu a4, 1(a1) ; RV64I-NEXT: lbu a5, 2(a1) ; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a1) -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; 
RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t1 ; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a1, a1, a3 @@ -262,17 +262,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 0(a1) -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a5, a1, a6 -; RV32I-NEXT: or a5, a5, a4 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a5, a1, a5 ; RV32I-NEXT: addi a4, a5, -32 ; RV32I-NEXT: srl a1, a3, a5 ; RV32I-NEXT: bltz a4, .LBB3_2 @@ -322,47 +322,47 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 1(a1) -; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a3, 0(a1) +; RV64I-NEXT: lbu a4, 1(a1) ; RV64I-NEXT: lbu a5, 2(a1) ; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a1) -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t1 ; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a1, a1, a3 @@ -395,17 +395,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 0(a1) -; 
RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a5, a1, a6 -; RV32I-NEXT: or a5, a5, a4 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a5, a1, a5 ; RV32I-NEXT: addi a4, a5, -32 ; RV32I-NEXT: sll a1, a3, a5 ; RV32I-NEXT: bltz a4, .LBB4_2 @@ -455,47 +455,47 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_8bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lbu a3, 1(a1) -; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a3, 0(a1) +; RV64I-NEXT: lbu a4, 1(a1) ; RV64I-NEXT: lbu a5, 2(a1) ; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a1) -; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: lbu a6, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a1, a1, t1 ; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a1, a1, a3 @@ -528,17 +528,17 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a4, a6, 24 ; RV32I-NEXT: or a5, a4, a5 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: lbu a5, 1(a1) ; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: lbu a5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a5, a1, a5 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a5, a1, a6 ; RV32I-NEXT: addi a6, a5, -32 ; RV32I-NEXT: sra 
a1, a3, a5 ; RV32I-NEXT: bltz a6, .LBB5_2 @@ -589,47 +589,47 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 13(a0) -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, t2, t1 ; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 1(a1) -; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a5, 1(a1) ; RV64I-NEXT: lbu a6, 2(a1) ; RV64I-NEXT: lbu a7, 3(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: lbu t2, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 5(a1) -; RV64I-NEXT: lbu a6, 4(a1) -; RV64I-NEXT: lbu a7, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a1, a1, a5 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a5, a1, a4 @@ -640,25 +640,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB6_3 ; RV64I-NEXT: .LBB6_2: -; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 0(a0) +; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) ; RV64I-NEXT: lbu t0, 2(a0) ; RV64I-NEXT: lbu t1, 3(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: lbu t2, 4(a0) +; RV64I-NEXT: lbu t3, 5(a0) +; RV64I-NEXT: lbu t4, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a0) -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli t4, t4, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t4 ; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 @@ -710,36 +710,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { 
; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 1(a1) -; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: sw s5, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a4, 1(a1) +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a6, 1(a0) +; RV32I-NEXT: lbu a7, 2(a0) +; RV32I-NEXT: lbu t0, 3(a0) +; RV32I-NEXT: lbu t1, 4(a0) +; RV32I-NEXT: lbu t2, 5(a0) +; RV32I-NEXT: lbu t3, 6(a0) +; RV32I-NEXT: lbu t4, 7(a0) +; RV32I-NEXT: lbu t5, 8(a0) +; RV32I-NEXT: lbu t6, 9(a0) +; RV32I-NEXT: lbu s0, 10(a0) +; RV32I-NEXT: lbu s1, 11(a0) ; RV32I-NEXT: lbu s2, 12(a0) ; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: or s0, s0, s1 -; RV32I-NEXT: lbu s1, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) ; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu a0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, s1 -; RV32I-NEXT: or a1, a1, s0 -; RV32I-NEXT: sb zero, 43(sp) -; RV32I-NEXT: sb zero, 42(sp) -; RV32I-NEXT: sb zero, 41(sp) -; RV32I-NEXT: sb zero, 40(sp) +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: sb zero, 39(sp) ; RV32I-NEXT: sb zero, 38(sp) ; RV32I-NEXT: sb zero, 37(sp) @@ -752,115 +749,120 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb zero, 30(sp) ; RV32I-NEXT: sb zero, 29(sp) ; RV32I-NEXT: sb zero, 28(sp) -; RV32I-NEXT: sb a0, 27(sp) -; RV32I-NEXT: sb s4, 26(sp) -; RV32I-NEXT: sb s3, 25(sp) -; RV32I-NEXT: sb s2, 24(sp) -; RV32I-NEXT: sb t6, 23(sp) -; RV32I-NEXT: sb t5, 22(sp) -; RV32I-NEXT: sb t4, 21(sp) -; RV32I-NEXT: sb t3, 20(sp) -; RV32I-NEXT: sb t2, 19(sp) -; RV32I-NEXT: sb t1, 18(sp) -; RV32I-NEXT: sb t0, 17(sp) -; RV32I-NEXT: sb a7, 16(sp) -; RV32I-NEXT: sb a6, 15(sp) -; RV32I-NEXT: sb a5, 14(sp) -; RV32I-NEXT: sb a4, 13(sp) -; RV32I-NEXT: sb a3, 12(sp) -; RV32I-NEXT: slli a0, a1, 25 -; RV32I-NEXT: srli a0, a0, 28 -; RV32I-NEXT: addi a3, sp, 12 -; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: srl a0, a5, a4 -; RV32I-NEXT: lbu a1, 9(a3) -; RV32I-NEXT: lbu a6, 8(a3) -; RV32I-NEXT: lbu a7, 10(a3) -; RV32I-NEXT: lbu t0, 11(a3) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a6, a6, a1 -; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a4 -; RV32I-NEXT: sll a1, a1, a7 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu a7, 1(a3) -; RV32I-NEXT: lbu t0, 0(a3) -; RV32I-NEXT: lbu t1, 2(a3) -; RV32I-NEXT: lbu t2, 3(a3) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a7, a7, t0 -; 
RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a4 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: xori t0, a4, 31 -; RV32I-NEXT: sll a5, a5, t0 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: srl a6, a6, a4 -; RV32I-NEXT: lbu t1, 13(a3) -; RV32I-NEXT: lbu t2, 12(a3) -; RV32I-NEXT: lbu t3, 14(a3) -; RV32I-NEXT: lbu a3, 15(a3) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: or a3, a3, t1 -; RV32I-NEXT: slli t1, a3, 1 -; RV32I-NEXT: sll t0, t1, t0 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: srl a3, a3, a4 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a4, a6, 16 -; RV32I-NEXT: sb a4, 10(a2) -; RV32I-NEXT: srli a4, a6, 8 -; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: srli a4, a3, 16 -; RV32I-NEXT: sb a4, 14(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: sb zero, 27(sp) +; RV32I-NEXT: sb zero, 26(sp) +; RV32I-NEXT: sb zero, 25(sp) +; RV32I-NEXT: sb zero, 24(sp) +; RV32I-NEXT: sb s5, 23(sp) +; RV32I-NEXT: sb s4, 22(sp) +; RV32I-NEXT: sb s3, 21(sp) +; RV32I-NEXT: sb s2, 20(sp) +; RV32I-NEXT: sb s1, 19(sp) +; RV32I-NEXT: sb s0, 18(sp) +; RV32I-NEXT: sb t6, 17(sp) +; RV32I-NEXT: sb t5, 16(sp) +; RV32I-NEXT: sb t4, 15(sp) +; RV32I-NEXT: sb t3, 14(sp) +; RV32I-NEXT: sb t2, 13(sp) +; RV32I-NEXT: sb t1, 12(sp) +; RV32I-NEXT: sb t0, 11(sp) +; RV32I-NEXT: sb a7, 10(sp) +; RV32I-NEXT: sb a6, 9(sp) +; RV32I-NEXT: sb a5, 8(sp) +; RV32I-NEXT: slli a1, a0, 25 +; RV32I-NEXT: srli a1, a1, 28 +; RV32I-NEXT: addi a3, sp, 8 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a4, 1(a1) +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a6, 3(a1) +; RV32I-NEXT: lbu a7, 4(a1) +; RV32I-NEXT: lbu t0, 5(a1) +; RV32I-NEXT: lbu t1, 6(a1) +; RV32I-NEXT: lbu t2, 7(a1) +; RV32I-NEXT: lbu t3, 8(a1) +; RV32I-NEXT: lbu t4, 9(a1) +; RV32I-NEXT: lbu t5, 10(a1) +; RV32I-NEXT: lbu t6, 11(a1) +; RV32I-NEXT: lbu s0, 12(a1) +; RV32I-NEXT: lbu s1, 13(a1) +; RV32I-NEXT: lbu s2, 14(a1) +; RV32I-NEXT: lbu a1, 15(a1) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: andi a0, a0, 7 +; RV32I-NEXT: srl t0, a7, a0 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or t1, t4, t3 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t2, t6, t5 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t2, t1, 1 +; RV32I-NEXT: not t3, a0 +; RV32I-NEXT: sll t2, t2, t3 +; RV32I-NEXT: or t2, t0, t2 +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: srl a3, a3, a0 +; RV32I-NEXT: slli a7, a7, 1 +; RV32I-NEXT: xori a4, a0, 31 +; RV32I-NEXT: sll a5, a7, a4 +; RV32I-NEXT: or a5, a3, a5 +; RV32I-NEXT: srl a6, t1, a0 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, s2 +; RV32I-NEXT: or a1, a1, s0 +; RV32I-NEXT: slli a7, a1, 1 +; RV32I-NEXT: sll a4, a7, a4 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: srl a0, a1, a0 +; RV32I-NEXT: sb a6, 8(a2) +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb a3, 0(a2) +; RV32I-NEXT: sb t0, 4(a2) +; 
RV32I-NEXT: srli a1, a6, 16 +; RV32I-NEXT: sb a1, 10(a2) +; RV32I-NEXT: srli a1, a6, 8 +; RV32I-NEXT: sb a1, 9(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 15(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: sb a0, 2(a2) ; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a7, 16 -; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a7, 8 ; RV32I-NEXT: sb a3, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: srli a0, t0, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, t0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: srli a0, t0, 24 -; RV32I-NEXT: sb a0, 11(a2) +; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: sb a4, 11(a2) ; RV32I-NEXT: srli a5, a5, 24 ; RV32I-NEXT: sb a5, 3(a2) -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a0, t2, 24 +; RV32I-NEXT: sb a0, 7(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -872,47 +874,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t2, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a6, 6(a0) -; RV64I-NEXT: lbu a7, 7(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, t2, t1 ; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 1(a1) -; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a5, 1(a1) ; RV64I-NEXT: lbu a6, 2(a1) ; RV64I-NEXT: lbu a7, 3(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: lbu t2, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 5(a1) -; RV64I-NEXT: lbu a6, 4(a1) -; RV64I-NEXT: lbu a7, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, a7 +; 
RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a1, a1, a5 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a5, a1, a4 @@ -923,25 +925,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB7_3 ; RV64I-NEXT: .LBB7_2: -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 8(a0) +; RV64I-NEXT: lbu a6, 8(a0) +; RV64I-NEXT: lbu a7, 9(a0) ; RV64I-NEXT: lbu t0, 10(a0) ; RV64I-NEXT: lbu t1, 11(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: lbu t2, 12(a0) +; RV64I-NEXT: lbu t3, 13(a0) +; RV64I-NEXT: lbu t4, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: lbu t0, 12(a0) -; RV64I-NEXT: lbu t1, 14(a0) -; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli t4, t4, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t4 ; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 @@ -993,36 +995,33 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 1(a1) -; RV32I-NEXT: lbu s1, 0(a1) +; RV32I-NEXT: sw s5, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a4, 1(a1) +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a6, 1(a0) +; RV32I-NEXT: lbu a7, 2(a0) +; RV32I-NEXT: lbu t0, 3(a0) +; RV32I-NEXT: lbu t1, 4(a0) +; RV32I-NEXT: lbu t2, 5(a0) +; RV32I-NEXT: lbu t3, 6(a0) +; RV32I-NEXT: lbu t4, 7(a0) +; RV32I-NEXT: lbu t5, 8(a0) +; RV32I-NEXT: lbu t6, 9(a0) +; RV32I-NEXT: lbu s0, 10(a0) +; RV32I-NEXT: lbu s1, 11(a0) ; RV32I-NEXT: lbu s2, 12(a0) ; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: or s0, s0, s1 -; RV32I-NEXT: lbu s1, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) ; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu a0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, s1 -; RV32I-NEXT: or a1, a1, s0 -; RV32I-NEXT: sb zero, 27(sp) -; RV32I-NEXT: sb zero, 26(sp) -; RV32I-NEXT: sb zero, 25(sp) -; RV32I-NEXT: sb zero, 24(sp) +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: sb zero, 23(sp) ; RV32I-NEXT: sb zero, 22(sp) ; RV32I-NEXT: sb zero, 21(sp) @@ -1035,115 +1034,120 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb zero, 14(sp) ; RV32I-NEXT: sb zero, 13(sp) ; RV32I-NEXT: sb zero, 12(sp) -; RV32I-NEXT: sb a0, 43(sp) -; RV32I-NEXT: sb s4, 42(sp) -; RV32I-NEXT: sb s3, 41(sp) -; RV32I-NEXT: sb s2, 40(sp) -; RV32I-NEXT: sb t6, 39(sp) -; 
RV32I-NEXT: sb t5, 38(sp) -; RV32I-NEXT: sb t4, 37(sp) -; RV32I-NEXT: sb t3, 36(sp) -; RV32I-NEXT: sb t2, 35(sp) -; RV32I-NEXT: sb t1, 34(sp) -; RV32I-NEXT: sb t0, 33(sp) -; RV32I-NEXT: sb a7, 32(sp) -; RV32I-NEXT: sb a6, 31(sp) -; RV32I-NEXT: sb a5, 30(sp) -; RV32I-NEXT: sb a4, 29(sp) -; RV32I-NEXT: sb a3, 28(sp) -; RV32I-NEXT: slli a0, a1, 25 -; RV32I-NEXT: srli a0, a0, 28 -; RV32I-NEXT: addi a3, sp, 28 -; RV32I-NEXT: sub a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: sb zero, 11(sp) +; RV32I-NEXT: sb zero, 10(sp) +; RV32I-NEXT: sb zero, 9(sp) +; RV32I-NEXT: sb zero, 8(sp) +; RV32I-NEXT: sb s5, 39(sp) +; RV32I-NEXT: sb s4, 38(sp) +; RV32I-NEXT: sb s3, 37(sp) +; RV32I-NEXT: sb s2, 36(sp) +; RV32I-NEXT: sb s1, 35(sp) +; RV32I-NEXT: sb s0, 34(sp) +; RV32I-NEXT: sb t6, 33(sp) +; RV32I-NEXT: sb t5, 32(sp) +; RV32I-NEXT: sb t4, 31(sp) +; RV32I-NEXT: sb t3, 30(sp) +; RV32I-NEXT: sb t2, 29(sp) +; RV32I-NEXT: sb t1, 28(sp) +; RV32I-NEXT: sb t0, 27(sp) +; RV32I-NEXT: sb a7, 26(sp) +; RV32I-NEXT: sb a6, 25(sp) +; RV32I-NEXT: sb a5, 24(sp) +; RV32I-NEXT: slli a1, a0, 25 +; RV32I-NEXT: srli a1, a1, 28 +; RV32I-NEXT: addi a3, sp, 24 +; RV32I-NEXT: sub a3, a3, a1 +; RV32I-NEXT: lbu a1, 0(a3) +; RV32I-NEXT: lbu a4, 1(a3) +; RV32I-NEXT: lbu a5, 2(a3) +; RV32I-NEXT: lbu a6, 3(a3) +; RV32I-NEXT: lbu a7, 4(a3) +; RV32I-NEXT: lbu t0, 5(a3) +; RV32I-NEXT: lbu t1, 6(a3) +; RV32I-NEXT: lbu t2, 7(a3) +; RV32I-NEXT: lbu t3, 8(a3) +; RV32I-NEXT: lbu t4, 9(a3) +; RV32I-NEXT: lbu t5, 10(a3) +; RV32I-NEXT: lbu t6, 11(a3) +; RV32I-NEXT: lbu s0, 12(a3) +; RV32I-NEXT: lbu s1, 13(a3) +; RV32I-NEXT: lbu s2, 14(a3) +; RV32I-NEXT: lbu a3, 15(a3) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: andi a0, a0, 7 +; RV32I-NEXT: sll t0, a7, a0 +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a1, a4, a1 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: sll a0, a5, a4 -; RV32I-NEXT: lbu a1, 1(a3) -; RV32I-NEXT: lbu a6, 0(a3) -; RV32I-NEXT: lbu a7, 2(a3) -; RV32I-NEXT: lbu t0, 3(a3) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a6, a6, a1 -; RV32I-NEXT: srli a1, a6, 1 -; RV32I-NEXT: xori a7, a4, 31 -; RV32I-NEXT: srl a1, a1, a7 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu t0, 13(a3) -; RV32I-NEXT: lbu t1, 12(a3) -; RV32I-NEXT: lbu t2, 14(a3) -; RV32I-NEXT: lbu t3, 15(a3) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t1, t3, t2 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: sll t0, t0, a4 -; RV32I-NEXT: lbu t1, 9(a3) -; RV32I-NEXT: lbu t2, 8(a3) -; RV32I-NEXT: lbu t3, 10(a3) -; RV32I-NEXT: lbu a3, 11(a3) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: srli a4, a1, 1 +; RV32I-NEXT: xori a5, a0, 31 +; RV32I-NEXT: srl a4, a4, a5 +; RV32I-NEXT: or a4, t0, a4 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: slli s2, s2, 16 ; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: or a3, a3, t1 -; 
RV32I-NEXT: srli t1, a3, 1 -; RV32I-NEXT: srl a7, t1, a7 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: sll a3, a3, a4 -; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: not t1, a4 -; RV32I-NEXT: srl a5, a5, t1 +; RV32I-NEXT: or a3, a3, s2 +; RV32I-NEXT: or a3, a3, s0 +; RV32I-NEXT: sll a3, a3, a0 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or a6, t4, t3 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t1, t6, t5 +; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: srli t1, a6, 1 +; RV32I-NEXT: srl a5, t1, a5 ; RV32I-NEXT: or a5, a3, a5 -; RV32I-NEXT: sll a4, a6, a4 -; RV32I-NEXT: sb a4, 0(a2) -; RV32I-NEXT: srli a6, a3, 16 -; RV32I-NEXT: sb a6, 10(a2) -; RV32I-NEXT: srli a6, a3, 24 -; RV32I-NEXT: sb a6, 11(a2) +; RV32I-NEXT: sll a6, a6, a0 +; RV32I-NEXT: srli a7, a7, 1 +; RV32I-NEXT: not t1, a0 +; RV32I-NEXT: srl a7, a7, t1 +; RV32I-NEXT: or a7, a6, a7 +; RV32I-NEXT: sll a0, a1, a0 +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: srli a1, a6, 16 +; RV32I-NEXT: sb a1, 10(a2) +; RV32I-NEXT: srli a1, a6, 24 +; RV32I-NEXT: sb a1, 11(a2) +; RV32I-NEXT: srli a1, a6, 8 +; RV32I-NEXT: sb a1, 9(a2) +; RV32I-NEXT: srli a1, a3, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a3, 24 +; RV32I-NEXT: sb a1, 15(a2) ; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 9(a2) -; RV32I-NEXT: srli a3, t0, 16 -; RV32I-NEXT: sb a3, 14(a2) -; RV32I-NEXT: srli a3, t0, 24 -; RV32I-NEXT: sb a3, 15(a2) -; RV32I-NEXT: srli a3, t0, 8 ; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a4, 16 -; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a4, 24 -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 3(a2) ; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 1(a2) +; RV32I-NEXT: srli a0, t0, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: srli a0, t0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: sb a5, 8(a2) -; RV32I-NEXT: sb a7, 12(a2) -; RV32I-NEXT: sb a1, 4(a2) +; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb a5, 12(a2) +; RV32I-NEXT: sb a4, 4(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1155,47 +1159,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 9(a0) -; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) ; RV64I-NEXT: lbu a5, 10(a0) ; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 13(a0) -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) 
-; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, t2, t1 ; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a5, a4, 32 ; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: lbu a5, 1(a1) -; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a5, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) ; RV64I-NEXT: lbu a7, 2(a1) ; RV64I-NEXT: lbu t0, 3(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: lbu t1, 4(a1) +; RV64I-NEXT: lbu t2, 5(a1) +; RV64I-NEXT: lbu t3, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli t0, t0, 24 ; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 5(a1) -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: lbu t0, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a5, a1, a5 @@ -1208,25 +1212,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: mv a1, a3 ; RV64I-NEXT: j .LBB8_3 ; RV64I-NEXT: .LBB8_2: -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a6, 1(a0) ; RV64I-NEXT: lbu a7, 2(a0) ; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a6 +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 5(a0) +; RV64I-NEXT: lbu t3, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a4, a6, a4 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli t0, t0, 24 ; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: or a4, a6, a4 -; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t0, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, t3 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a4 @@ -1277,163 +1281,165 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 15(a0) -; RV32I-NEXT: slli a4, a3, 24 -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 2(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu t2, 5(a0) -; RV32I-NEXT: lbu t3, 6(a0) -; RV32I-NEXT: lbu t4, 7(a0) -; RV32I-NEXT: lbu t5, 8(a0) -; RV32I-NEXT: lbu t6, 9(a0) -; RV32I-NEXT: lbu s0, 10(a0) -; RV32I-NEXT: lbu s1, 1(a1) -; RV32I-NEXT: lbu s2, 0(a1) -; RV32I-NEXT: lbu s3, 11(a0) -; RV32I-NEXT: lbu s4, 12(a0) -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or s1, s1, s2 -; RV32I-NEXT: lbu s2, 2(a1) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 1(a0) +; RV32I-NEXT: lbu a6, 2(a0) +; RV32I-NEXT: lbu a7, 3(a0) +; 
RV32I-NEXT: lbu t0, 4(a0) +; RV32I-NEXT: lbu t1, 5(a0) +; RV32I-NEXT: lbu t2, 6(a0) +; RV32I-NEXT: lbu t3, 7(a0) +; RV32I-NEXT: lbu t4, 8(a0) +; RV32I-NEXT: lbu t5, 9(a0) +; RV32I-NEXT: lbu t6, 10(a0) +; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: lbu s1, 12(a0) +; RV32I-NEXT: lbu s2, 13(a0) +; RV32I-NEXT: lbu s3, 14(a0) +; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: lbu s4, 1(a1) +; RV32I-NEXT: slli s5, a3, 24 +; RV32I-NEXT: lbu s6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: lbu s5, 13(a0) -; RV32I-NEXT: lbu a0, 14(a0) -; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: or a0, s4, a0 +; RV32I-NEXT: slli s6, s6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, s2 -; RV32I-NEXT: or a1, a1, s1 +; RV32I-NEXT: or a1, a1, s6 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: sb a3, 19(sp) +; RV32I-NEXT: sb s3, 18(sp) +; RV32I-NEXT: sb s2, 17(sp) +; RV32I-NEXT: sb s1, 16(sp) +; RV32I-NEXT: sb s0, 15(sp) +; RV32I-NEXT: sb t6, 14(sp) +; RV32I-NEXT: sb t5, 13(sp) +; RV32I-NEXT: sb t4, 12(sp) +; RV32I-NEXT: sb t3, 11(sp) +; RV32I-NEXT: sb t2, 10(sp) +; RV32I-NEXT: sb t1, 9(sp) +; RV32I-NEXT: sb t0, 8(sp) +; RV32I-NEXT: sb a7, 7(sp) +; RV32I-NEXT: sb a6, 6(sp) +; RV32I-NEXT: sb a5, 5(sp) +; RV32I-NEXT: sb a4, 4(sp) +; RV32I-NEXT: srai a1, s5, 31 +; RV32I-NEXT: sb a1, 32(sp) +; RV32I-NEXT: sb a1, 28(sp) +; RV32I-NEXT: sb a1, 24(sp) +; RV32I-NEXT: sb a1, 20(sp) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 35(sp) +; RV32I-NEXT: srli a4, a1, 16 +; RV32I-NEXT: sb a4, 34(sp) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 33(sp) +; RV32I-NEXT: sb a3, 31(sp) +; RV32I-NEXT: sb a4, 30(sp) +; RV32I-NEXT: sb a1, 29(sp) +; RV32I-NEXT: sb a3, 27(sp) +; RV32I-NEXT: sb a4, 26(sp) +; RV32I-NEXT: sb a1, 25(sp) ; RV32I-NEXT: sb a3, 23(sp) -; RV32I-NEXT: sb a0, 22(sp) -; RV32I-NEXT: sb s5, 21(sp) -; RV32I-NEXT: sb s4, 20(sp) -; RV32I-NEXT: sb s3, 19(sp) -; RV32I-NEXT: sb s0, 18(sp) -; RV32I-NEXT: sb t6, 17(sp) -; RV32I-NEXT: sb t5, 16(sp) -; RV32I-NEXT: sb t4, 15(sp) -; RV32I-NEXT: sb t3, 14(sp) -; RV32I-NEXT: sb t2, 13(sp) -; RV32I-NEXT: sb t1, 12(sp) -; RV32I-NEXT: sb t0, 11(sp) -; RV32I-NEXT: sb a7, 10(sp) -; RV32I-NEXT: sb a6, 9(sp) -; RV32I-NEXT: sb a5, 8(sp) -; RV32I-NEXT: srai a4, a4, 31 -; RV32I-NEXT: sb a4, 36(sp) -; RV32I-NEXT: sb a4, 32(sp) -; RV32I-NEXT: sb a4, 28(sp) -; RV32I-NEXT: sb a4, 24(sp) -; RV32I-NEXT: srli a0, a4, 24 -; RV32I-NEXT: sb a0, 39(sp) -; RV32I-NEXT: srli a3, a4, 16 -; RV32I-NEXT: sb a3, 38(sp) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 37(sp) -; RV32I-NEXT: sb a0, 35(sp) -; RV32I-NEXT: sb a3, 34(sp) -; RV32I-NEXT: sb a4, 33(sp) -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: sb a3, 30(sp) -; RV32I-NEXT: sb a4, 29(sp) -; RV32I-NEXT: sb a0, 27(sp) -; RV32I-NEXT: sb a3, 26(sp) -; RV32I-NEXT: sb a4, 25(sp) -; RV32I-NEXT: slli a0, a1, 25 -; RV32I-NEXT: srli a0, a0, 28 -; RV32I-NEXT: addi a3, sp, 8 -; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: srl a0, a5, a4 -; RV32I-NEXT: lbu a1, 9(a3) -; RV32I-NEXT: lbu a6, 8(a3) -; RV32I-NEXT: lbu a7, 10(a3) -; RV32I-NEXT: lbu t0, 11(a3) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a6, t0, a7 -; 
RV32I-NEXT: or a6, a6, a1 -; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a4 -; RV32I-NEXT: sll a1, a1, a7 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu a7, 1(a3) -; RV32I-NEXT: lbu t0, 0(a3) -; RV32I-NEXT: lbu t1, 2(a3) -; RV32I-NEXT: lbu t2, 3(a3) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: sb a4, 22(sp) +; RV32I-NEXT: sb a1, 21(sp) +; RV32I-NEXT: slli a1, a0, 25 +; RV32I-NEXT: srli a1, a1, 28 +; RV32I-NEXT: addi a3, sp, 4 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu a4, 1(a1) +; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: lbu a6, 3(a1) +; RV32I-NEXT: lbu a7, 4(a1) +; RV32I-NEXT: lbu t0, 5(a1) +; RV32I-NEXT: lbu t1, 6(a1) +; RV32I-NEXT: lbu t2, 7(a1) +; RV32I-NEXT: lbu t3, 8(a1) +; RV32I-NEXT: lbu t4, 9(a1) +; RV32I-NEXT: lbu t5, 10(a1) +; RV32I-NEXT: lbu t6, 11(a1) +; RV32I-NEXT: lbu s0, 12(a1) +; RV32I-NEXT: lbu s1, 13(a1) +; RV32I-NEXT: lbu s2, 14(a1) +; RV32I-NEXT: lbu a1, 15(a1) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a7, t0, a7 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a4 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: xori t0, a4, 31 -; RV32I-NEXT: sll a5, a5, t0 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: srl a6, a6, a4 -; RV32I-NEXT: lbu t1, 13(a3) -; RV32I-NEXT: lbu t2, 12(a3) -; RV32I-NEXT: lbu t3, 14(a3) -; RV32I-NEXT: lbu a3, 15(a3) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: or a3, a3, t1 -; RV32I-NEXT: slli t1, a3, 1 -; RV32I-NEXT: sll t0, t1, t0 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: sra a3, a3, a4 +; RV32I-NEXT: andi a0, a0, 7 +; RV32I-NEXT: srl t0, a7, a0 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or t1, t4, t3 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t2, t6, t5 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t2, t1, 1 +; RV32I-NEXT: not t3, a0 +; RV32I-NEXT: sll t2, t2, t3 +; RV32I-NEXT: or t2, t0, t2 +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: srl a3, a3, a0 +; RV32I-NEXT: slli a7, a7, 1 +; RV32I-NEXT: xori a4, a0, 31 +; RV32I-NEXT: sll a5, a7, a4 +; RV32I-NEXT: or a5, a3, a5 +; RV32I-NEXT: srl a6, t1, a0 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, s2 +; RV32I-NEXT: or a1, a1, s0 +; RV32I-NEXT: slli a7, a1, 1 +; RV32I-NEXT: sll a4, a7, a4 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: sra a0, a1, a0 ; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a4, a6, 16 -; RV32I-NEXT: sb a4, 10(a2) -; RV32I-NEXT: srli a4, a6, 8 -; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: srli a4, a3, 16 -; RV32I-NEXT: sb a4, 14(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb a3, 0(a2) +; RV32I-NEXT: sb t0, 4(a2) +; RV32I-NEXT: srli a1, a6, 16 +; RV32I-NEXT: sb a1, 10(a2) +; RV32I-NEXT: srli a1, a6, 8 +; RV32I-NEXT: sb a1, 9(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 15(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: sb a0, 2(a2) ; RV32I-NEXT: 
srli a3, a3, 8 -; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a7, 16 -; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a7, 8 ; RV32I-NEXT: sb a3, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: srli a0, t0, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, t0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: srli a0, t0, 24 -; RV32I-NEXT: sb a0, 11(a2) +; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: sb a4, 11(a2) ; RV32I-NEXT: srli a5, a5, 24 ; RV32I-NEXT: sb a5, 3(a2) -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a0, t2, 24 +; RV32I-NEXT: sb a0, 7(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s5, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 36(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1460,18 +1466,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu s2, 29(a0) +; RV64I-NEXT: lbu s4, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: lbu a3, 0(a1) +; RV64I-NEXT: lbu a4, 1(a1) +; RV64I-NEXT: lbu a5, 2(a1) +; RV64I-NEXT: lbu a6, 3(a1) +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a5, a1, a3 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 2(a0) +; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 3(a0) +; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 4(a0) +; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 5(a0) +; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu t2, 7(a0) ; RV64I-NEXT: lbu t3, 8(a0) @@ -1480,69 +1511,28 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu t6, 11(a0) ; RV64I-NEXT: lbu s0, 12(a0) ; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: lbu s8, 20(a0) -; 
RV64I-NEXT: lbu s9, 1(a1) -; RV64I-NEXT: lbu s10, 0(a1) -; RV64I-NEXT: lbu s11, 2(a1) -; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s9, s9, s10 -; RV64I-NEXT: slli s11, s11, 16 -; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: lbu s10, 5(a1) -; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s11, s11, s9 -; RV64I-NEXT: lbu s9, 4(a1) -; RV64I-NEXT: slli s10, s10, 8 -; RV64I-NEXT: lbu ra, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s9 -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: slli ra, ra, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, ra -; RV64I-NEXT: lbu ra, 22(a0) -; RV64I-NEXT: or a1, a1, s10 -; RV64I-NEXT: lbu s10, 23(a0) -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s11 -; RV64I-NEXT: lbu s11, 24(a0) -; RV64I-NEXT: lbu a7, 25(a0) -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu a5, 27(a0) -; RV64I-NEXT: lbu a1, 31(a0) -; RV64I-NEXT: lbu a3, 30(a0) -; RV64I-NEXT: lbu a4, 29(a0) +; RV64I-NEXT: lbu s3, 14(a0) +; RV64I-NEXT: lbu s5, 15(a0) +; RV64I-NEXT: lbu s7, 16(a0) +; RV64I-NEXT: lbu s8, 17(a0) +; RV64I-NEXT: lbu s9, 18(a0) +; RV64I-NEXT: lbu s10, 19(a0) +; RV64I-NEXT: lbu s11, 20(a0) +; RV64I-NEXT: lbu ra, 21(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu a7, 23(a0) +; RV64I-NEXT: lbu a6, 24(a0) +; RV64I-NEXT: lbu a4, 25(a0) +; RV64I-NEXT: lbu a3, 26(a0) +; RV64I-NEXT: lbu a1, 27(a0) ; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: sb a1, 87(sp) -; RV64I-NEXT: sb a3, 86(sp) -; RV64I-NEXT: sb a4, 85(sp) +; RV64I-NEXT: sb s6, 87(sp) +; RV64I-NEXT: sb s4, 86(sp) +; RV64I-NEXT: sb s2, 85(sp) ; RV64I-NEXT: sb a0, 84(sp) -; RV64I-NEXT: sb a5, 83(sp) -; RV64I-NEXT: sb a6, 82(sp) -; RV64I-NEXT: sb a7, 81(sp) -; RV64I-NEXT: sb s11, 80(sp) -; RV64I-NEXT: sb s10, 79(sp) -; RV64I-NEXT: sb ra, 78(sp) -; RV64I-NEXT: sb s9, 77(sp) -; RV64I-NEXT: sb s8, 76(sp) -; RV64I-NEXT: sb s7, 75(sp) -; RV64I-NEXT: sb s6, 74(sp) -; RV64I-NEXT: sb s5, 73(sp) -; RV64I-NEXT: sb s4, 72(sp) -; RV64I-NEXT: sb s3, 71(sp) -; RV64I-NEXT: sb s2, 70(sp) -; RV64I-NEXT: sb s1, 69(sp) -; RV64I-NEXT: sb s0, 68(sp) -; RV64I-NEXT: sb t6, 67(sp) -; RV64I-NEXT: sb t5, 66(sp) -; RV64I-NEXT: sb t4, 65(sp) +; RV64I-NEXT: sb a1, 83(sp) +; RV64I-NEXT: sb a3, 82(sp) +; RV64I-NEXT: sb a4, 81(sp) ; RV64I-NEXT: sb zero, 119(sp) ; RV64I-NEXT: sb zero, 118(sp) ; RV64I-NEXT: sb zero, 117(sp) @@ -1575,6 +1565,22 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb zero, 90(sp) ; RV64I-NEXT: sb zero, 89(sp) ; RV64I-NEXT: sb zero, 88(sp) +; RV64I-NEXT: sb a6, 80(sp) +; RV64I-NEXT: sb a7, 79(sp) +; RV64I-NEXT: sb t0, 78(sp) +; RV64I-NEXT: sb ra, 77(sp) +; RV64I-NEXT: sb s11, 76(sp) +; RV64I-NEXT: sb s10, 75(sp) +; RV64I-NEXT: sb s9, 74(sp) +; RV64I-NEXT: sb s8, 73(sp) +; RV64I-NEXT: sb s7, 72(sp) +; RV64I-NEXT: sb s5, 71(sp) +; RV64I-NEXT: sb s3, 70(sp) +; RV64I-NEXT: sb s1, 69(sp) +; RV64I-NEXT: sb s0, 68(sp) +; RV64I-NEXT: sb t6, 67(sp) +; RV64I-NEXT: sb t5, 66(sp) +; RV64I-NEXT: sb t4, 65(sp) ; RV64I-NEXT: sb t3, 64(sp) ; RV64I-NEXT: sb t2, 63(sp) ; RV64I-NEXT: sb t1, 62(sp) @@ -1590,111 +1596,112 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: slli a0, t0, 56 +; RV64I-NEXT: slli a0, a5, 56 +; RV64I-NEXT: mv s11, a5 ; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi a3, sp, 56 -; RV64I-NEXT: add a3, a3, a0 -; RV64I-NEXT: lbu a0, 9(a3) -; RV64I-NEXT: lbu a1, 8(a3) -; RV64I-NEXT: 
lbu a4, 10(a3) -; RV64I-NEXT: lbu a5, 11(a3) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a1, 13(a3) -; RV64I-NEXT: lbu a4, 12(a3) -; RV64I-NEXT: lbu a5, 14(a3) -; RV64I-NEXT: lbu a6, 15(a3) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: addi a1, sp, 56 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 10(a0) +; RV64I-NEXT: lbu a6, 11(a0) +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: lbu t3, 16(a0) +; RV64I-NEXT: lbu t4, 17(a0) +; RV64I-NEXT: lbu t5, 18(a0) +; RV64I-NEXT: lbu t6, 19(a0) +; RV64I-NEXT: lbu s0, 20(a0) +; RV64I-NEXT: lbu a1, 21(a0) +; RV64I-NEXT: lbu s1, 22(a0) +; RV64I-NEXT: lbu s2, 23(a0) +; RV64I-NEXT: lbu s3, 24(a0) +; RV64I-NEXT: lbu s4, 25(a0) +; RV64I-NEXT: lbu s5, 26(a0) +; RV64I-NEXT: lbu s6, 27(a0) +; RV64I-NEXT: lbu s7, 28(a0) +; RV64I-NEXT: lbu s8, 29(a0) +; RV64I-NEXT: lbu s9, 30(a0) +; RV64I-NEXT: lbu s10, 31(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a4, a1, a0 -; RV64I-NEXT: andi a1, t0, 7 -; RV64I-NEXT: lbu a0, 17(a3) -; RV64I-NEXT: lbu a5, 16(a3) -; RV64I-NEXT: lbu a6, 18(a3) -; RV64I-NEXT: lbu a7, 19(a3) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a3) -; RV64I-NEXT: lbu a6, 20(a3) -; RV64I-NEXT: lbu a7, 22(a3) -; RV64I-NEXT: lbu t0, 23(a3) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, t2, t1 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a4, a4, a3 +; RV64I-NEXT: andi a3, s11, 7 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or a5, t4, t3 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: or a6, t6, t5 ; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a5, a5, a0 -; RV64I-NEXT: slli a0, a5, 1 -; RV64I-NEXT: not a6, a1 -; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a3) -; RV64I-NEXT: lbu a7, 0(a3) -; RV64I-NEXT: lbu t0, 2(a3) -; RV64I-NEXT: lbu t1, 3(a3) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: or a6, s2, s1 +; RV64I-NEXT: or a1, a6, a1 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a5, a1, a5 +; RV64I-NEXT: slli a1, a5, 1 +; RV64I-NEXT: not a6, a3 +; RV64I-NEXT: sll a1, a1, a6 +; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t0, 2(a0) +; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: lbu t2, 4(a0) +; RV64I-NEXT: lbu t3, 5(a0) +; RV64I-NEXT: lbu t4, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; 
RV64I-NEXT: lbu a7, 5(a3) -; RV64I-NEXT: lbu t0, 4(a3) -; RV64I-NEXT: lbu t1, 6(a3) -; RV64I-NEXT: lbu t2, 7(a3) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 25(a3) -; RV64I-NEXT: lbu t0, 24(a3) -; RV64I-NEXT: lbu t1, 26(a3) -; RV64I-NEXT: lbu t2, 27(a3) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a3) -; RV64I-NEXT: lbu t1, 28(a3) -; RV64I-NEXT: lbu t2, 30(a3) -; RV64I-NEXT: lbu a3, 31(a3) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or t0, t0, t1 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a3, a3, t2 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t4 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a6, a0, a6 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s4, s3 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: or a7, s6, s5 +; RV64I-NEXT: or a0, a7, a0 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: or a7, s8, s7 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: or t0, s10, s9 ; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: or a3, a3, t0 -; RV64I-NEXT: xori t0, a1, 63 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: xori t0, a3, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a7, a3, a7 -; RV64I-NEXT: slli a3, a7, 1 -; RV64I-NEXT: sll t0, a3, t0 -; RV64I-NEXT: srl a3, a4, a1 -; RV64I-NEXT: srl a4, a6, a1 -; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: srl a1, a7, a1 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: or a7, a7, a0 +; RV64I-NEXT: slli a0, a7, 1 +; RV64I-NEXT: sll t0, a0, t0 +; RV64I-NEXT: srl a0, a4, a3 +; RV64I-NEXT: srl a4, a6, a3 +; RV64I-NEXT: srl a5, a5, a3 +; RV64I-NEXT: srl a3, a7, a3 ; RV64I-NEXT: srli a6, a5, 48 ; RV64I-NEXT: sb a6, 22(a2) ; RV64I-NEXT: srli a6, a5, 40 @@ -1709,55 +1716,55 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: srli a5, a5, 8 ; RV64I-NEXT: sb a5, 17(a2) -; RV64I-NEXT: srli a5, a1, 56 +; RV64I-NEXT: srli a5, a3, 56 ; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a1, 48 +; RV64I-NEXT: srli a5, a3, 48 ; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: srli a5, a1, 40 +; RV64I-NEXT: srli a5, a3, 40 ; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: srli a5, a3, 32 ; RV64I-NEXT: sb a5, 28(a2) -; RV64I-NEXT: srli a5, a1, 24 +; RV64I-NEXT: srli a5, a3, 24 ; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, a1, 16 +; RV64I-NEXT: srli a5, a3, 16 ; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a1, 24(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 25(a2) -; RV64I-NEXT: srli a1, a4, 48 -; RV64I-NEXT: sb a1, 6(a2) -; RV64I-NEXT: srli a1, a4, 40 -; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a4, 32 -; RV64I-NEXT: sb a1, 4(a2) -; RV64I-NEXT: srli a1, a4, 24 -; RV64I-NEXT: sb a1, 3(a2) -; RV64I-NEXT: srli a1, a4, 16 -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: or a1, a4, t1 +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a3, a4, 48 +; RV64I-NEXT: sb a3, 6(a2) +; RV64I-NEXT: srli a3, a4, 40 
+; RV64I-NEXT: sb a3, 5(a2) +; RV64I-NEXT: srli a3, a4, 32 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: srli a3, a4, 24 +; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: srli a3, a4, 16 +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: or a3, a4, t1 ; RV64I-NEXT: sb a4, 0(a2) ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: srli a4, a3, 48 +; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: srli a4, a3, 40 +; RV64I-NEXT: srli a4, a0, 40 ; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: srli a4, a3, 32 +; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: sb a4, 12(a2) -; RV64I-NEXT: srli a4, a3, 24 +; RV64I-NEXT: srli a4, a0, 24 ; RV64I-NEXT: sb a4, 11(a2) -; RV64I-NEXT: srli a4, a3, 16 +; RV64I-NEXT: srli a4, a0, 16 ; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: sb a3, 8(a2) -; RV64I-NEXT: srli a3, a3, 8 -; RV64I-NEXT: sb a3, 9(a2) -; RV64I-NEXT: srli a3, a6, 56 -; RV64I-NEXT: sb a3, 23(a2) +; RV64I-NEXT: or a1, a0, a1 +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: srli a0, a6, 56 +; RV64I-NEXT: sb a0, 23(a2) +; RV64I-NEXT: srli a3, a3, 56 +; RV64I-NEXT: sb a3, 7(a2) ; RV64I-NEXT: srli a1, a1, 56 -; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: sb a0, 15(a2) +; RV64I-NEXT: sb a1, 15(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload @@ -1790,19 +1797,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t1, 29(a0) +; RV32I-NEXT: lbu a5, 30(a0) +; RV32I-NEXT: lbu a4, 31(a0) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a7, a1, a6 +; RV32I-NEXT: sw a7, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 2(a0) +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 3(a0) +; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 5(a0) +; RV32I-NEXT: sw a1, 0(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu t0, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: lbu t3, 8(a0) ; RV32I-NEXT: lbu t4, 9(a0) @@ -1816,44 +1837,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s5, 17(a0) ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s10, 1(a1) ; RV32I-NEXT: lbu s8, 20(a0) ; RV32I-NEXT: lbu s9, 21(a0) -; 
RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: lbu ra, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s10, s10, s11 -; RV32I-NEXT: lbu s11, 22(a0) -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, ra -; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s10 -; RV32I-NEXT: lbu s10, 24(a0) -; RV32I-NEXT: lbu a7, 25(a0) -; RV32I-NEXT: lbu a6, 26(a0) -; RV32I-NEXT: lbu a5, 27(a0) -; RV32I-NEXT: lbu a1, 31(a0) -; RV32I-NEXT: lbu a3, 30(a0) -; RV32I-NEXT: lbu a4, 29(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: lbu ra, 24(a0) +; RV32I-NEXT: lbu a6, 25(a0) +; RV32I-NEXT: lbu a3, 26(a0) +; RV32I-NEXT: lbu a1, 27(a0) ; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: sb a1, 59(sp) -; RV32I-NEXT: sb a3, 58(sp) -; RV32I-NEXT: sb a4, 57(sp) +; RV32I-NEXT: sb a4, 59(sp) +; RV32I-NEXT: sb a5, 58(sp) +; RV32I-NEXT: sb t1, 57(sp) ; RV32I-NEXT: sb a0, 56(sp) -; RV32I-NEXT: sb a5, 55(sp) -; RV32I-NEXT: sb a6, 54(sp) -; RV32I-NEXT: sb a7, 53(sp) -; RV32I-NEXT: sb s10, 52(sp) -; RV32I-NEXT: sb ra, 51(sp) -; RV32I-NEXT: sb s11, 50(sp) -; RV32I-NEXT: sb s9, 49(sp) -; RV32I-NEXT: sb s8, 48(sp) -; RV32I-NEXT: sb s7, 47(sp) -; RV32I-NEXT: sb s6, 46(sp) -; RV32I-NEXT: sb s5, 45(sp) -; RV32I-NEXT: sb s4, 44(sp) +; RV32I-NEXT: sb a1, 55(sp) +; RV32I-NEXT: sb a3, 54(sp) ; RV32I-NEXT: sb zero, 91(sp) ; RV32I-NEXT: sb zero, 90(sp) ; RV32I-NEXT: sb zero, 89(sp) @@ -1886,6 +1884,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb zero, 62(sp) ; RV32I-NEXT: sb zero, 61(sp) ; RV32I-NEXT: sb zero, 60(sp) +; RV32I-NEXT: sb a6, 53(sp) +; RV32I-NEXT: sb ra, 52(sp) +; RV32I-NEXT: sb s11, 51(sp) +; RV32I-NEXT: sb s10, 50(sp) +; RV32I-NEXT: sb s9, 49(sp) +; RV32I-NEXT: sb s8, 48(sp) +; RV32I-NEXT: sb s7, 47(sp) +; RV32I-NEXT: sb s6, 46(sp) +; RV32I-NEXT: sb s5, 45(sp) +; RV32I-NEXT: sb s4, 44(sp) ; RV32I-NEXT: sb s3, 43(sp) ; RV32I-NEXT: sb s2, 42(sp) ; RV32I-NEXT: sb s1, 41(sp) @@ -1895,188 +1903,193 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb t4, 37(sp) ; RV32I-NEXT: sb t3, 36(sp) ; RV32I-NEXT: sb t2, 35(sp) -; RV32I-NEXT: sb t1, 34(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb t0, 34(sp) +; RV32I-NEXT: lw a0, 0(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 33(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 32(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 30(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 29(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: slli a0, t0, 24 -; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a4, sp, 28 -; RV32I-NEXT: add a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t5, a3, a0 -; RV32I-NEXT: andi a3, t0, 7 -; RV32I-NEXT: lbu a0, 9(a4) -; RV32I-NEXT: lbu a1, 8(a4) -; RV32I-NEXT: lbu a5, 10(a4) -; RV32I-NEXT: 
lbu a6, 11(a4) -; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a0, 28(sp) +; RV32I-NEXT: slli a0, a7, 24 +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: addi a1, sp, 28 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a7, 6(a0) +; RV32I-NEXT: lbu a3, 7(a0) +; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 8(a0) +; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu t0, 9(a0) +; RV32I-NEXT: lbu t1, 10(a0) +; RV32I-NEXT: lbu t2, 11(a0) +; RV32I-NEXT: lbu t3, 12(a0) +; RV32I-NEXT: lbu t4, 13(a0) +; RV32I-NEXT: lbu t5, 14(a0) +; RV32I-NEXT: lbu t6, 15(a0) +; RV32I-NEXT: lbu s0, 16(a0) +; RV32I-NEXT: lbu s1, 17(a0) +; RV32I-NEXT: lbu s2, 18(a0) +; RV32I-NEXT: lbu s3, 19(a0) +; RV32I-NEXT: lbu a6, 20(a0) +; RV32I-NEXT: lbu s5, 21(a0) +; RV32I-NEXT: lbu s6, 22(a0) +; RV32I-NEXT: lbu s7, 23(a0) +; RV32I-NEXT: lbu s4, 24(a0) +; RV32I-NEXT: lbu s8, 25(a0) +; RV32I-NEXT: lbu s9, 26(a0) +; RV32I-NEXT: lbu s10, 27(a0) +; RV32I-NEXT: lbu s11, 28(a0) +; RV32I-NEXT: lbu a5, 29(a0) +; RV32I-NEXT: lbu ra, 30(a0) +; RV32I-NEXT: lbu a3, 31(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a1 +; RV32I-NEXT: slli a1, a7, 16 +; RV32I-NEXT: lw a7, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a1, a7, a1 +; RV32I-NEXT: or a4, a1, a4 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, t0, a1 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: lbu t0, 0(a0) +; RV32I-NEXT: lbu t1, 1(a0) +; RV32I-NEXT: or a7, a7, a1 +; RV32I-NEXT: lbu a1, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a1, a6, a5 -; RV32I-NEXT: or a6, a1, a0 -; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t1, a3 -; RV32I-NEXT: sll a0, a0, t1 -; RV32I-NEXT: lbu a1, 1(a4) -; RV32I-NEXT: lbu a5, 0(a4) -; RV32I-NEXT: lbu a7, 2(a4) -; RV32I-NEXT: lbu t0, 3(a4) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or t0, a5, a1 -; RV32I-NEXT: slli a1, t5, 1 -; RV32I-NEXT: xori t2, a3, 31 -; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: lbu a5, 13(a4) -; RV32I-NEXT: lbu a7, 12(a4) -; RV32I-NEXT: lbu t3, 14(a4) -; RV32I-NEXT: lbu t4, 15(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t3, a7, a5 -; RV32I-NEXT: lbu a5, 17(a4) -; RV32I-NEXT: lbu a7, 16(a4) -; RV32I-NEXT: lbu t4, 18(a4) -; RV32I-NEXT: lbu t6, 19(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: or a1, a0, t0 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or a0, t4, t3 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t4 -; RV32I-NEXT: or t4, a7, a5 -; RV32I-NEXT: slli a5, t4, 1 -; RV32I-NEXT: sll a7, a5, t1 -; RV32I-NEXT: lbu a5, 21(a4) -; RV32I-NEXT: lbu t6, 20(a4) -; RV32I-NEXT: lbu s0, 22(a4) -; RV32I-NEXT: lbu s1, 23(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or t0, t6, t5 +; 
RV32I-NEXT: or t1, t0, a0 +; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or s0, s0, a5 -; RV32I-NEXT: lbu a5, 25(a4) -; RV32I-NEXT: lbu t6, 24(a4) -; RV32I-NEXT: lbu s1, 26(a4) -; RV32I-NEXT: lbu s2, 27(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s2, s1 -; RV32I-NEXT: or t6, t6, a5 -; RV32I-NEXT: lbu a5, 29(a4) -; RV32I-NEXT: lbu s1, 28(a4) -; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t1, s2, t1 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli s3, s3, 24 +; RV32I-NEXT: or a0, s3, s2 +; RV32I-NEXT: or s0, a0, s0 +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or a0, s5, a6 +; RV32I-NEXT: lw a6, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: andi t2, a6, 7 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: or a6, s7, s6 +; RV32I-NEXT: slli t0, a7, 1 +; RV32I-NEXT: or t3, a6, a0 +; RV32I-NEXT: not t4, t2 +; RV32I-NEXT: sll a0, t0, t4 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or t0, s8, s4 +; RV32I-NEXT: slli t5, a4, 1 +; RV32I-NEXT: slli s9, s9, 16 +; RV32I-NEXT: slli s10, s10, 24 +; RV32I-NEXT: or t6, s10, s9 +; RV32I-NEXT: slli a6, s0, 1 +; RV32I-NEXT: sll a6, a6, t4 +; RV32I-NEXT: or t6, t6, t0 +; RV32I-NEXT: slli t0, t6, 1 +; RV32I-NEXT: sll t4, t0, t4 +; RV32I-NEXT: xori s1, t2, 31 +; RV32I-NEXT: sll t0, t5, s1 ; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, s1 -; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: lbu a4, 31(a4) +; RV32I-NEXT: or a5, a5, s11 +; RV32I-NEXT: slli t5, t1, 1 +; RV32I-NEXT: sll t5, t5, s1 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a3, a3, ra ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: sll s2, s2, t2 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: slli s1, s0, 1 -; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or s3, a4, a5 -; RV32I-NEXT: slli a4, s3, 1 -; RV32I-NEXT: sll t2, a4, t2 -; RV32I-NEXT: srl a4, t5, a3 -; RV32I-NEXT: srl a5, t0, a3 -; RV32I-NEXT: srl t0, t3, a3 -; RV32I-NEXT: srl a6, a6, a3 -; RV32I-NEXT: srl t3, s0, a3 -; RV32I-NEXT: srl t4, t4, a3 -; RV32I-NEXT: srl t5, t6, a3 -; RV32I-NEXT: srl a3, s3, a3 -; RV32I-NEXT: srli t6, t5, 16 -; RV32I-NEXT: sb t6, 26(a2) -; RV32I-NEXT: or t2, t5, t2 -; RV32I-NEXT: sb t5, 24(a2) -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: sb t5, 25(a2) -; RV32I-NEXT: srli t5, a3, 24 -; RV32I-NEXT: sb t5, 31(a2) -; RV32I-NEXT: srli t5, a3, 16 -; RV32I-NEXT: sb t5, 30(a2) +; RV32I-NEXT: sll s2, s2, s1 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: slli a5, a3, 1 +; RV32I-NEXT: sll a5, a5, s1 +; RV32I-NEXT: srl a4, a4, t2 +; RV32I-NEXT: srl a1, a1, t2 +; RV32I-NEXT: srl t1, t1, t2 +; RV32I-NEXT: srl a7, a7, t2 +; RV32I-NEXT: srl t3, t3, t2 +; RV32I-NEXT: srl s0, s0, t2 +; RV32I-NEXT: srl t6, t6, t2 +; RV32I-NEXT: srl a3, a3, t2 +; RV32I-NEXT: srli t2, t6, 16 +; RV32I-NEXT: sb t2, 26(a2) +; RV32I-NEXT: or a5, t6, a5 +; RV32I-NEXT: sb t6, 24(a2) +; RV32I-NEXT: srli t2, t6, 8 +; RV32I-NEXT: sb t2, 25(a2) +; RV32I-NEXT: srli t2, a3, 24 +; RV32I-NEXT: sb t2, 31(a2) +; RV32I-NEXT: srli t2, a3, 16 +; RV32I-NEXT: sb t2, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, t4, 16 +; RV32I-NEXT: srli a3, s0, 16 ; RV32I-NEXT: sb a3, 18(a2) -; RV32I-NEXT: or a3, t4, s1 -; RV32I-NEXT: sb t4, 16(a2) -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t4, 17(a2) -; RV32I-NEXT: srli t4, t3, 16 -; RV32I-NEXT: sb t4, 22(a2) -; RV32I-NEXT: or 
t1, t3, t1 +; RV32I-NEXT: or a3, s0, s2 +; RV32I-NEXT: sb s0, 16(a2) +; RV32I-NEXT: srli s0, s0, 8 +; RV32I-NEXT: sb s0, 17(a2) +; RV32I-NEXT: srli t2, t3, 16 +; RV32I-NEXT: sb t2, 22(a2) +; RV32I-NEXT: or t2, t3, t4 ; RV32I-NEXT: sb t3, 20(a2) ; RV32I-NEXT: srli t3, t3, 8 ; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: srli t3, a6, 16 +; RV32I-NEXT: srli t3, a7, 16 ; RV32I-NEXT: sb t3, 10(a2) -; RV32I-NEXT: or t3, a6, s2 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: srli a6, a6, 8 -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t0, 16 -; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: sb t0, 12(a2) -; RV32I-NEXT: srli a7, t0, 8 +; RV32I-NEXT: or t3, a7, t5 +; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: srli a7, a7, 8 +; RV32I-NEXT: sb a7, 9(a2) +; RV32I-NEXT: srli a7, t1, 16 +; RV32I-NEXT: sb a7, 14(a2) +; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: sb t1, 12(a2) +; RV32I-NEXT: srli a7, t1, 8 ; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, a5, 16 +; RV32I-NEXT: srli a7, a1, 16 ; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, a5, a1 -; RV32I-NEXT: sb a5, 0(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: srli a5, a4, 16 -; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: or a7, a1, t0 +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a4, 16 +; RV32I-NEXT: sb a1, 6(a2) ; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: sb a4, 4(a2) ; RV32I-NEXT: srli a4, a4, 8 ; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: srli a4, t2, 24 -; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: srli a5, a5, 24 +; RV32I-NEXT: sb a5, 27(a2) ; RV32I-NEXT: srli a3, a3, 24 ; RV32I-NEXT: sb a3, 19(a2) -; RV32I-NEXT: srli a3, t1, 24 -; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a3, t3, 24 -; RV32I-NEXT: sb a3, 11(a2) -; RV32I-NEXT: srli a3, a6, 24 -; RV32I-NEXT: sb a3, 15(a2) -; RV32I-NEXT: srli a1, a1, 24 +; RV32I-NEXT: srli a1, t2, 24 +; RV32I-NEXT: sb a1, 23(a2) +; RV32I-NEXT: srli a1, t3, 24 +; RV32I-NEXT: sb a1, 11(a2) +; RV32I-NEXT: srli a1, a6, 24 +; RV32I-NEXT: sb a1, 15(a2) +; RV32I-NEXT: srli a1, a7, 24 ; RV32I-NEXT: sb a1, 3(a2) ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: sb a0, 7(a2) @@ -2118,18 +2131,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu s2, 29(a0) +; RV64I-NEXT: lbu s4, 30(a0) +; RV64I-NEXT: lbu s6, 31(a0) +; RV64I-NEXT: lbu a3, 0(a1) +; RV64I-NEXT: lbu a4, 1(a1) +; RV64I-NEXT: lbu a5, 2(a1) +; RV64I-NEXT: lbu a6, 3(a1) +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or 
a1, a1, t1 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or s11, a1, a3 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 2(a0) +; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 3(a0) +; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 4(a0) +; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 5(a0) +; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu t2, 7(a0) ; RV64I-NEXT: lbu t3, 8(a0) @@ -2138,70 +2176,28 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu t6, 11(a0) ; RV64I-NEXT: lbu s0, 12(a0) ; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 1(a1) -; RV64I-NEXT: lbu s10, 0(a1) -; RV64I-NEXT: lbu s11, 2(a1) -; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s9, s9, s10 -; RV64I-NEXT: slli s11, s11, 16 -; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: lbu s10, 5(a1) -; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s11, s11, s9 -; RV64I-NEXT: lbu s9, 4(a1) -; RV64I-NEXT: slli s10, s10, 8 -; RV64I-NEXT: lbu ra, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s9 -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: slli ra, ra, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, ra -; RV64I-NEXT: lbu ra, 22(a0) -; RV64I-NEXT: or a1, a1, s10 -; RV64I-NEXT: lbu s10, 23(a0) -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s11 -; RV64I-NEXT: lbu s11, 24(a0) -; RV64I-NEXT: lbu a7, 25(a0) -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu a5, 27(a0) -; RV64I-NEXT: lbu a1, 31(a0) -; RV64I-NEXT: lbu a3, 30(a0) -; RV64I-NEXT: lbu a4, 29(a0) +; RV64I-NEXT: lbu s3, 14(a0) +; RV64I-NEXT: lbu s5, 15(a0) +; RV64I-NEXT: lbu s7, 16(a0) +; RV64I-NEXT: lbu s8, 17(a0) +; RV64I-NEXT: lbu s9, 18(a0) +; RV64I-NEXT: lbu s10, 19(a0) +; RV64I-NEXT: lbu t0, 20(a0) +; RV64I-NEXT: lbu ra, 21(a0) +; RV64I-NEXT: lbu a7, 22(a0) +; RV64I-NEXT: lbu a6, 23(a0) +; RV64I-NEXT: lbu a5, 24(a0) +; RV64I-NEXT: lbu a4, 25(a0) +; RV64I-NEXT: lbu a3, 26(a0) +; RV64I-NEXT: lbu a1, 27(a0) ; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: sb a1, 119(sp) -; RV64I-NEXT: sb a3, 118(sp) -; RV64I-NEXT: sb a4, 117(sp) +; RV64I-NEXT: sb s6, 119(sp) +; RV64I-NEXT: sb s4, 118(sp) +; RV64I-NEXT: sb s2, 117(sp) ; RV64I-NEXT: sb a0, 116(sp) -; RV64I-NEXT: sb a5, 115(sp) -; RV64I-NEXT: sb a6, 114(sp) -; RV64I-NEXT: sb a7, 113(sp) -; RV64I-NEXT: sb s11, 112(sp) -; RV64I-NEXT: sb s10, 111(sp) -; RV64I-NEXT: sb ra, 110(sp) -; RV64I-NEXT: sb s9, 109(sp) -; RV64I-NEXT: sb s8, 108(sp) -; RV64I-NEXT: sb s7, 107(sp) -; RV64I-NEXT: sb s6, 106(sp) -; RV64I-NEXT: sb s5, 105(sp) -; RV64I-NEXT: sb s4, 104(sp) -; RV64I-NEXT: sb s3, 103(sp) -; RV64I-NEXT: sb s2, 102(sp) -; RV64I-NEXT: sb s1, 101(sp) -; RV64I-NEXT: sb s0, 100(sp) -; RV64I-NEXT: sb t6, 99(sp) -; RV64I-NEXT: sb t5, 98(sp) -; RV64I-NEXT: sb t4, 97(sp) -; RV64I-NEXT: sb t3, 96(sp) +; RV64I-NEXT: sb a1, 115(sp) +; RV64I-NEXT: sb a3, 114(sp) +; RV64I-NEXT: sb a4, 113(sp) ; RV64I-NEXT: sb zero, 87(sp) ; RV64I-NEXT: sb zero, 86(sp) ; RV64I-NEXT: sb zero, 85(sp) @@ -2234,6 +2230,23 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr 
%dst) nounwind { ; RV64I-NEXT: sb zero, 58(sp) ; RV64I-NEXT: sb zero, 57(sp) ; RV64I-NEXT: sb zero, 56(sp) +; RV64I-NEXT: sb a5, 112(sp) +; RV64I-NEXT: sb a6, 111(sp) +; RV64I-NEXT: sb a7, 110(sp) +; RV64I-NEXT: sb ra, 109(sp) +; RV64I-NEXT: sb t0, 108(sp) +; RV64I-NEXT: sb s10, 107(sp) +; RV64I-NEXT: sb s9, 106(sp) +; RV64I-NEXT: sb s8, 105(sp) +; RV64I-NEXT: sb s7, 104(sp) +; RV64I-NEXT: sb s5, 103(sp) +; RV64I-NEXT: sb s3, 102(sp) +; RV64I-NEXT: sb s1, 101(sp) +; RV64I-NEXT: sb s0, 100(sp) +; RV64I-NEXT: sb t6, 99(sp) +; RV64I-NEXT: sb t5, 98(sp) +; RV64I-NEXT: sb t4, 97(sp) +; RV64I-NEXT: sb t3, 96(sp) ; RV64I-NEXT: sb t2, 95(sp) ; RV64I-NEXT: sb t1, 94(sp) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload @@ -2248,173 +2261,173 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 89(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 88(sp) -; RV64I-NEXT: slli a0, t0, 56 +; RV64I-NEXT: slli a0, s11, 56 ; RV64I-NEXT: srli a0, a0, 59 ; RV64I-NEXT: addi a1, sp, 88 ; RV64I-NEXT: sub a0, a1, a0 -; RV64I-NEXT: lbu a1, 9(a0) ; RV64I-NEXT: lbu a3, 8(a0) -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a5, 11(a0) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: lbu a3, 13(a0) -; RV64I-NEXT: lbu a4, 12(a0) -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu a6, 15(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 10(a0) +; RV64I-NEXT: lbu a6, 11(a0) +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: lbu a1, 16(a0) +; RV64I-NEXT: lbu t3, 17(a0) +; RV64I-NEXT: lbu t4, 18(a0) +; RV64I-NEXT: lbu t6, 19(a0) +; RV64I-NEXT: lbu t5, 20(a0) +; RV64I-NEXT: lbu s0, 21(a0) +; RV64I-NEXT: lbu s1, 22(a0) +; RV64I-NEXT: lbu s2, 23(a0) +; RV64I-NEXT: lbu s3, 24(a0) +; RV64I-NEXT: lbu s4, 25(a0) +; RV64I-NEXT: lbu s5, 26(a0) +; RV64I-NEXT: lbu s6, 27(a0) +; RV64I-NEXT: lbu s7, 28(a0) +; RV64I-NEXT: lbu s8, 29(a0) +; RV64I-NEXT: lbu s9, 30(a0) +; RV64I-NEXT: lbu s10, 31(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a3, a1 -; RV64I-NEXT: andi a1, t0, 7 -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, t2, t1 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 1(a0) ; RV64I-NEXT: lbu a6, 2(a0) ; RV64I-NEXT: lbu a7, 3(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 5(a0) +; RV64I-NEXT: lbu t2, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 5(a0) -; RV64I-NEXT: lbu a6, 4(a0) -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu t0, 7(a0) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: 
slli t1, t1, 8 +; RV64I-NEXT: or a5, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a4, a0, a4 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s4, s3 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: or a5, s6, s5 +; RV64I-NEXT: or a0, a5, a0 +; RV64I-NEXT: slli s8, s8, 8 +; RV64I-NEXT: or a5, s8, s7 +; RV64I-NEXT: slli s9, s9, 16 +; RV64I-NEXT: slli s10, s10, 24 +; RV64I-NEXT: or a6, s10, s9 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 25(a0) -; RV64I-NEXT: lbu a6, 24(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu t0, 27(a0) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 29(a0) -; RV64I-NEXT: lbu a7, 28(a0) -; RV64I-NEXT: lbu t0, 30(a0) -; RV64I-NEXT: lbu t1, 31(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu a7, 16(a0) -; RV64I-NEXT: lbu t0, 18(a0) -; RV64I-NEXT: lbu t1, 19(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: lbu a7, 21(a0) -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: lbu t0, 20(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: lbu t1, 22(a0) -; RV64I-NEXT: lbu a0, 23(a0) -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: srli t0, a4, 1 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or t1, a0, t1 -; RV64I-NEXT: xori t2, a1, 63 -; RV64I-NEXT: srl a0, t0, t2 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or a5, a5, a0 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or a0, t3, a1 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: or a1, t6, t4 +; RV64I-NEXT: or a1, a1, a0 +; RV64I-NEXT: andi a6, s11, 7 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: or a7, s0, t5 +; RV64I-NEXT: srli a0, a4, 1 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: or t0, s2, s1 +; RV64I-NEXT: xori t1, a6, 63 +; RV64I-NEXT: srl a0, a0, t1 +; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: srli a7, a6, 1 -; RV64I-NEXT: srl a7, a7, t2 -; RV64I-NEXT: srli t0, a3, 1 -; RV64I-NEXT: not t1, a1 -; RV64I-NEXT: srl t0, t0, t1 -; RV64I-NEXT: sll a3, a3, a1 -; RV64I-NEXT: sll a5, a5, a1 -; RV64I-NEXT: sll a6, a6, a1 -; RV64I-NEXT: sll a1, a4, a1 -; RV64I-NEXT: srli a4, a6, 56 -; RV64I-NEXT: sb a4, 23(a2) -; RV64I-NEXT: srli a4, a6, 48 -; RV64I-NEXT: sb a4, 22(a2) -; RV64I-NEXT: srli a4, a6, 40 -; RV64I-NEXT: sb a4, 21(a2) -; RV64I-NEXT: srli a4, a6, 32 -; RV64I-NEXT: sb a4, 20(a2) -; RV64I-NEXT: srli a4, a6, 24 -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: srli a4, a6, 16 -; RV64I-NEXT: sb a4, 18(a2) -; RV64I-NEXT: or a4, a6, t0 -; RV64I-NEXT: srli a6, a6, 8 -; RV64I-NEXT: sb a6, 17(a2) +; RV64I-NEXT: or a7, a7, a1 +; RV64I-NEXT: srli a1, a7, 1 +; RV64I-NEXT: srl t0, a1, t1 +; RV64I-NEXT: srli a1, a3, 1 +; RV64I-NEXT: not t1, a6 +; RV64I-NEXT: srl t1, a1, t1 +; RV64I-NEXT: sll a1, a3, a6 +; RV64I-NEXT: sll a3, a5, a6 +; RV64I-NEXT: sll a5, a7, a6 +; RV64I-NEXT: 
sll a4, a4, a6 ; RV64I-NEXT: srli a6, a5, 56 -; RV64I-NEXT: sb a6, 31(a2) +; RV64I-NEXT: sb a6, 23(a2) ; RV64I-NEXT: srli a6, a5, 48 -; RV64I-NEXT: sb a6, 30(a2) +; RV64I-NEXT: sb a6, 22(a2) ; RV64I-NEXT: srli a6, a5, 40 -; RV64I-NEXT: sb a6, 29(a2) +; RV64I-NEXT: sb a6, 21(a2) ; RV64I-NEXT: srli a6, a5, 32 -; RV64I-NEXT: sb a6, 28(a2) +; RV64I-NEXT: sb a6, 20(a2) ; RV64I-NEXT: srli a6, a5, 24 -; RV64I-NEXT: sb a6, 27(a2) +; RV64I-NEXT: sb a6, 19(a2) ; RV64I-NEXT: srli a6, a5, 16 -; RV64I-NEXT: sb a6, 26(a2) -; RV64I-NEXT: or a6, a5, a7 +; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: or a6, a5, t1 ; RV64I-NEXT: srli a5, a5, 8 -; RV64I-NEXT: sb a5, 25(a2) -; RV64I-NEXT: srli a5, a1, 56 -; RV64I-NEXT: sb a5, 7(a2) -; RV64I-NEXT: srli a5, a1, 48 -; RV64I-NEXT: sb a5, 6(a2) -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: srli a5, a1, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: srli a5, a1, 24 -; RV64I-NEXT: sb a5, 3(a2) -; RV64I-NEXT: srli a5, a1, 16 -; RV64I-NEXT: sb a5, 2(a2) -; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 1(a2) -; RV64I-NEXT: srli a1, a3, 56 -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a3, 48 -; RV64I-NEXT: sb a1, 14(a2) -; RV64I-NEXT: srli a1, a3, 40 -; RV64I-NEXT: sb a1, 13(a2) -; RV64I-NEXT: srli a1, a3, 32 -; RV64I-NEXT: sb a1, 12(a2) -; RV64I-NEXT: srli a1, a3, 24 -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: srli a1, a3, 16 -; RV64I-NEXT: sb a1, 10(a2) -; RV64I-NEXT: or a0, a3, a0 +; RV64I-NEXT: sb a5, 17(a2) +; RV64I-NEXT: srli a5, a3, 56 +; RV64I-NEXT: sb a5, 31(a2) +; RV64I-NEXT: srli a5, a3, 48 +; RV64I-NEXT: sb a5, 30(a2) +; RV64I-NEXT: srli a5, a3, 40 +; RV64I-NEXT: sb a5, 29(a2) +; RV64I-NEXT: srli a5, a3, 32 +; RV64I-NEXT: sb a5, 28(a2) +; RV64I-NEXT: srli a5, a3, 24 +; RV64I-NEXT: sb a5, 27(a2) +; RV64I-NEXT: srli a5, a3, 16 +; RV64I-NEXT: sb a5, 26(a2) +; RV64I-NEXT: or a5, a3, t0 ; RV64I-NEXT: srli a3, a3, 8 -; RV64I-NEXT: sb a3, 9(a2) -; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: sb a6, 24(a2) +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a3, a4, 56 +; RV64I-NEXT: sb a3, 7(a2) +; RV64I-NEXT: srli a3, a4, 48 +; RV64I-NEXT: sb a3, 6(a2) +; RV64I-NEXT: srli a3, a4, 40 +; RV64I-NEXT: sb a3, 5(a2) +; RV64I-NEXT: srli a3, a4, 32 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: srli a3, a4, 24 +; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: srli a3, a4, 16 +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 15(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 14(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 12(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 10(a2) +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a6, 16(a2) +; RV64I-NEXT: sb a5, 24(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload @@ -2448,19 +2461,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: 
lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t1, 29(a0) +; RV32I-NEXT: lbu a5, 30(a0) +; RV32I-NEXT: lbu a4, 31(a0) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a7, a1, a6 +; RV32I-NEXT: sw a7, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 2(a0) +; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 3(a0) +; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a1, 5(a0) +; RV32I-NEXT: sw a1, 0(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu t0, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: lbu t3, 8(a0) ; RV32I-NEXT: lbu t4, 9(a0) @@ -2474,44 +2501,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s5, 17(a0) ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s10, 1(a1) ; RV32I-NEXT: lbu s8, 20(a0) ; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: lbu ra, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s10, s10, s11 -; RV32I-NEXT: lbu s11, 22(a0) -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, ra -; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s10 -; RV32I-NEXT: lbu s10, 24(a0) -; RV32I-NEXT: lbu a7, 25(a0) -; RV32I-NEXT: lbu a6, 26(a0) -; RV32I-NEXT: lbu a5, 27(a0) -; RV32I-NEXT: lbu a1, 31(a0) -; RV32I-NEXT: lbu a3, 30(a0) -; RV32I-NEXT: lbu a4, 29(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: lbu ra, 24(a0) +; RV32I-NEXT: lbu a6, 25(a0) +; RV32I-NEXT: lbu a3, 26(a0) +; RV32I-NEXT: lbu a1, 27(a0) ; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: sb a1, 91(sp) -; RV32I-NEXT: sb a3, 90(sp) -; RV32I-NEXT: sb a4, 89(sp) +; RV32I-NEXT: sb a4, 91(sp) +; RV32I-NEXT: sb a5, 90(sp) +; RV32I-NEXT: sb t1, 89(sp) ; RV32I-NEXT: sb a0, 88(sp) -; RV32I-NEXT: sb a5, 87(sp) -; RV32I-NEXT: sb a6, 86(sp) -; RV32I-NEXT: sb a7, 85(sp) -; RV32I-NEXT: sb s10, 84(sp) -; RV32I-NEXT: sb ra, 83(sp) -; RV32I-NEXT: sb s11, 82(sp) -; RV32I-NEXT: sb s9, 81(sp) -; RV32I-NEXT: sb s8, 80(sp) -; RV32I-NEXT: sb s7, 79(sp) -; RV32I-NEXT: sb s6, 78(sp) -; RV32I-NEXT: sb s5, 77(sp) -; RV32I-NEXT: sb s4, 76(sp) +; RV32I-NEXT: sb a1, 87(sp) +; RV32I-NEXT: sb a3, 86(sp) ; RV32I-NEXT: sb zero, 59(sp) ; RV32I-NEXT: sb zero, 58(sp) ; RV32I-NEXT: sb zero, 57(sp) @@ -2544,6 +2548,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb zero, 30(sp) ; RV32I-NEXT: sb zero, 29(sp) ; RV32I-NEXT: sb zero, 28(sp) +; RV32I-NEXT: sb a6, 85(sp) +; RV32I-NEXT: sb ra, 84(sp) +; RV32I-NEXT: sb s11, 83(sp) +; RV32I-NEXT: sb s10, 82(sp) +; RV32I-NEXT: sb s9, 81(sp) +; RV32I-NEXT: sb s8, 80(sp) +; RV32I-NEXT: sb s7, 79(sp) +; RV32I-NEXT: sb s6, 78(sp) +; RV32I-NEXT: sb s5, 77(sp) +; 
RV32I-NEXT: sb s4, 76(sp) ; RV32I-NEXT: sb s3, 75(sp) ; RV32I-NEXT: sb s2, 74(sp) ; RV32I-NEXT: sb s1, 73(sp) @@ -2553,190 +2567,195 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb t4, 69(sp) ; RV32I-NEXT: sb t3, 68(sp) ; RV32I-NEXT: sb t2, 67(sp) -; RV32I-NEXT: sb t1, 66(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb t0, 66(sp) +; RV32I-NEXT: lw a0, 0(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 65(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 64(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 63(sp) +; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a0, 62(sp) +; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a0, 61(sp) +; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb a0, 60(sp) +; RV32I-NEXT: slli a0, a7, 24 +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: addi a1, sp, 60 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: lbu a1, 4(a0) +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 6(a0) +; RV32I-NEXT: lbu a3, 7(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: sw a6, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a6, 9(a0) +; RV32I-NEXT: sw a6, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu t1, 10(a0) +; RV32I-NEXT: lbu t2, 11(a0) +; RV32I-NEXT: lbu t3, 12(a0) +; RV32I-NEXT: lbu t6, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) +; RV32I-NEXT: lbu t5, 15(a0) +; RV32I-NEXT: lbu s0, 16(a0) +; RV32I-NEXT: lbu s1, 17(a0) +; RV32I-NEXT: lbu s2, 18(a0) +; RV32I-NEXT: lbu t0, 19(a0) +; RV32I-NEXT: lbu s5, 20(a0) +; RV32I-NEXT: lbu s10, 21(a0) +; RV32I-NEXT: lbu s8, 22(a0) +; RV32I-NEXT: lbu s9, 23(a0) +; RV32I-NEXT: lbu s3, 24(a0) +; RV32I-NEXT: lbu s6, 25(a0) +; RV32I-NEXT: lbu s4, 26(a0) +; RV32I-NEXT: lbu s7, 27(a0) +; RV32I-NEXT: lbu s11, 28(a0) +; RV32I-NEXT: lbu ra, 29(a0) +; RV32I-NEXT: lbu a6, 30(a0) +; RV32I-NEXT: lbu a7, 31(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a1 +; RV32I-NEXT: slli a1, a5, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a5, a1, a4 +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a4, a0, a1 +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or a0, t6, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: or a1, t5, t4 +; RV32I-NEXT: or t3, a1, a0 ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 62(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 60(sp) -; RV32I-NEXT: slli a0, t0, 24 -; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a4, sp, 60 -; RV32I-NEXT: sub a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) ; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: lw a1, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t5, a3, a0 -; RV32I-NEXT: andi a1, t0, 7 -; RV32I-NEXT: lbu a0, 1(a4) -; RV32I-NEXT: lbu a3, 0(a4) -; RV32I-NEXT: lbu a5, 2(a4) -; RV32I-NEXT: lbu a6, 
3(a4) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a6, a5 -; RV32I-NEXT: or a6, a3, a0 -; RV32I-NEXT: srli a0, a6, 1 -; RV32I-NEXT: xori a7, a1, 31 -; RV32I-NEXT: srl a0, a0, a7 -; RV32I-NEXT: lbu a3, 13(a4) -; RV32I-NEXT: lbu a5, 12(a4) -; RV32I-NEXT: lbu t0, 14(a4) -; RV32I-NEXT: lbu t1, 15(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, t0 -; RV32I-NEXT: or t0, a5, a3 -; RV32I-NEXT: lbu a3, 9(a4) -; RV32I-NEXT: lbu a5, 8(a4) -; RV32I-NEXT: lbu t1, 10(a4) -; RV32I-NEXT: lbu t2, 11(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a5, t2, t1 -; RV32I-NEXT: or t1, a5, a3 -; RV32I-NEXT: srli a3, t1, 1 -; RV32I-NEXT: srl a5, a3, a7 -; RV32I-NEXT: srli t4, t5, 1 -; RV32I-NEXT: not t2, a1 -; RV32I-NEXT: lbu a3, 21(a4) -; RV32I-NEXT: lbu t3, 20(a4) -; RV32I-NEXT: lbu t6, 22(a4) -; RV32I-NEXT: lbu s0, 23(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: slli t6, t6, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or t3, s0, t6 -; RV32I-NEXT: or t3, t3, a3 -; RV32I-NEXT: lbu a3, 17(a4) -; RV32I-NEXT: lbu t6, 16(a4) -; RV32I-NEXT: lbu s0, 18(a4) -; RV32I-NEXT: lbu s1, 19(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t6 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or s0, s0, a3 -; RV32I-NEXT: lbu a3, 29(a4) -; RV32I-NEXT: lbu t6, 28(a4) -; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: lbu s2, 31(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t6 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s2, s1 -; RV32I-NEXT: lbu s1, 25(a4) -; RV32I-NEXT: lbu s2, 24(a4) -; RV32I-NEXT: srl t4, t4, t2 -; RV32I-NEXT: or t6, t6, a3 +; RV32I-NEXT: or a1, t2, t1 +; RV32I-NEXT: or t1, a1, a0 +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or a0, s10, s5 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: or a1, s9, s8 +; RV32I-NEXT: or t2, a1, a0 ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or a3, s1, s2 -; RV32I-NEXT: lbu s1, 26(a4) -; RV32I-NEXT: lbu a4, 27(a4) -; RV32I-NEXT: srli s2, s0, 1 -; RV32I-NEXT: srl s2, s2, a7 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: srli s1, t0, 1 -; RV32I-NEXT: srl s1, s1, t2 -; RV32I-NEXT: or a4, a4, a3 -; RV32I-NEXT: srli a3, a4, 1 -; RV32I-NEXT: srl a7, a3, a7 -; RV32I-NEXT: srli a3, t3, 1 -; RV32I-NEXT: srl t2, a3, t2 -; RV32I-NEXT: sll a3, t5, a1 -; RV32I-NEXT: sll t0, t0, a1 -; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t3, t3, a1 -; RV32I-NEXT: sll t5, s0, a1 -; RV32I-NEXT: sll t6, t6, a1 -; RV32I-NEXT: sll a4, a4, a1 -; RV32I-NEXT: sll a1, a6, a1 -; RV32I-NEXT: srli a6, a4, 24 -; RV32I-NEXT: sb a6, 27(a2) -; RV32I-NEXT: srli a6, a4, 16 -; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: or a6, a4, t2 -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 25(a2) -; RV32I-NEXT: srli a4, t6, 24 -; RV32I-NEXT: sb a4, 31(a2) -; RV32I-NEXT: srli a4, t6, 16 -; RV32I-NEXT: sb a4, 30(a2) -; RV32I-NEXT: or a4, t6, a7 -; RV32I-NEXT: srli a7, t6, 8 -; RV32I-NEXT: sb a7, 29(a2) -; RV32I-NEXT: srli a7, t5, 24 -; RV32I-NEXT: sb a7, 19(a2) -; RV32I-NEXT: srli a7, t5, 16 -; RV32I-NEXT: sb a7, 18(a2) -; RV32I-NEXT: or a7, t5, s1 -; RV32I-NEXT: srli t2, t5, 8 -; RV32I-NEXT: sb t2, 17(a2) 
-; RV32I-NEXT: srli t2, t3, 24 -; RV32I-NEXT: sb t2, 23(a2) -; RV32I-NEXT: srli t2, t3, 16 -; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: or t2, t3, s2 +; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: andi t4, a0, 7 +; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a0, t0, s2 +; RV32I-NEXT: srli a1, a4, 1 +; RV32I-NEXT: or s0, a0, s0 +; RV32I-NEXT: xori a3, t4, 31 +; RV32I-NEXT: srl a0, a1, a3 +; RV32I-NEXT: slli ra, ra, 8 +; RV32I-NEXT: or t0, ra, s11 +; RV32I-NEXT: srli a1, t1, 1 +; RV32I-NEXT: srl a1, a1, a3 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: srli a7, a5, 1 +; RV32I-NEXT: or t0, a6, t0 +; RV32I-NEXT: not t5, t4 +; RV32I-NEXT: srl a6, a7, t5 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: or a7, s6, s3 +; RV32I-NEXT: srli t6, s0, 1 +; RV32I-NEXT: srl t6, t6, a3 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: or s1, s7, s4 +; RV32I-NEXT: srli s2, t3, 1 +; RV32I-NEXT: srl s2, s2, t5 +; RV32I-NEXT: or a7, s1, a7 +; RV32I-NEXT: srli s1, a7, 1 +; RV32I-NEXT: srl s1, s1, a3 +; RV32I-NEXT: srli a3, t2, 1 +; RV32I-NEXT: srl t5, a3, t5 +; RV32I-NEXT: sll a3, a5, t4 +; RV32I-NEXT: sll a5, t3, t4 +; RV32I-NEXT: sll t1, t1, t4 +; RV32I-NEXT: sll t2, t2, t4 +; RV32I-NEXT: sll t3, s0, t4 +; RV32I-NEXT: sll t0, t0, t4 +; RV32I-NEXT: sll a7, a7, t4 +; RV32I-NEXT: sll a4, a4, t4 +; RV32I-NEXT: srli t4, a7, 24 +; RV32I-NEXT: sb t4, 27(a2) +; RV32I-NEXT: srli t4, a7, 16 +; RV32I-NEXT: sb t4, 26(a2) +; RV32I-NEXT: or t4, a7, t5 +; RV32I-NEXT: srli a7, a7, 8 +; RV32I-NEXT: sb a7, 25(a2) +; RV32I-NEXT: srli a7, t0, 24 +; RV32I-NEXT: sb a7, 31(a2) +; RV32I-NEXT: srli a7, t0, 16 +; RV32I-NEXT: sb a7, 30(a2) +; RV32I-NEXT: or a7, t0, s1 +; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: sb t0, 29(a2) +; RV32I-NEXT: srli t0, t3, 24 +; RV32I-NEXT: sb t0, 19(a2) +; RV32I-NEXT: srli t0, t3, 16 +; RV32I-NEXT: sb t0, 18(a2) +; RV32I-NEXT: or t0, t3, s2 ; RV32I-NEXT: srli t3, t3, 8 -; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: srli t3, t1, 24 -; RV32I-NEXT: sb t3, 11(a2) -; RV32I-NEXT: srli t3, t1, 16 -; RV32I-NEXT: sb t3, 10(a2) -; RV32I-NEXT: or t3, t1, t4 +; RV32I-NEXT: sb t3, 17(a2) +; RV32I-NEXT: srli t3, t2, 24 +; RV32I-NEXT: sb t3, 23(a2) +; RV32I-NEXT: srli t3, t2, 16 +; RV32I-NEXT: sb t3, 22(a2) +; RV32I-NEXT: or t3, t2, t6 +; RV32I-NEXT: srli t2, t2, 8 +; RV32I-NEXT: sb t2, 21(a2) +; RV32I-NEXT: srli t2, t1, 24 +; RV32I-NEXT: sb t2, 11(a2) +; RV32I-NEXT: srli t2, t1, 16 +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: or a6, t1, a6 ; RV32I-NEXT: srli t1, t1, 8 ; RV32I-NEXT: sb t1, 9(a2) -; RV32I-NEXT: srli t1, t0, 24 +; RV32I-NEXT: srli t1, a5, 24 ; RV32I-NEXT: sb t1, 15(a2) -; RV32I-NEXT: srli t1, t0, 16 +; RV32I-NEXT: srli t1, a5, 16 ; RV32I-NEXT: sb t1, 14(a2) -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: srli t0, t0, 8 -; RV32I-NEXT: sb t0, 13(a2) -; RV32I-NEXT: srli t0, a1, 24 -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: srli t0, a1, 16 -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: srli a1, a3, 16 -; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 13(a2) +; RV32I-NEXT: srli a5, a4, 24 +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: sb a5, 2(a2) +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 1(a2) +; 
RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 7(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 6(a2) ; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 5(a2) -; RV32I-NEXT: sb a6, 24(a2) -; RV32I-NEXT: sb a4, 28(a2) -; RV32I-NEXT: sb a7, 16(a2) -; RV32I-NEXT: sb t2, 20(a2) -; RV32I-NEXT: sb t3, 8(a2) -; RV32I-NEXT: sb a5, 12(a2) +; RV32I-NEXT: sb t4, 24(a2) +; RV32I-NEXT: sb a7, 28(a2) +; RV32I-NEXT: sb t0, 16(a2) +; RV32I-NEXT: sb t3, 20(a2) +; RV32I-NEXT: sb a6, 8(a2) +; RV32I-NEXT: sb a1, 12(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload @@ -2776,19 +2795,43 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t1, 31(a0) -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a3, 29(a0) ; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) +; RV64I-NEXT: lbu a3, 30(a0) ; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 2(a1) +; RV64I-NEXT: lbu t0, 3(a1) +; RV64I-NEXT: lbu t1, 4(a1) +; RV64I-NEXT: lbu t2, 5(a1) +; RV64I-NEXT: lbu t3, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: slli t2, t2, 8 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 2(a0) +; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu a1, 3(a0) +; RV64I-NEXT: sd a1, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 5(a0) ; RV64I-NEXT: lbu t3, 6(a0) ; RV64I-NEXT: lbu t4, 7(a0) ; RV64I-NEXT: lbu t5, 8(a0) @@ -2803,50 +2846,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s7, 17(a0) ; RV64I-NEXT: lbu s8, 18(a0) ; RV64I-NEXT: lbu s9, 19(a0) -; RV64I-NEXT: lbu a3, 1(a1) -; RV64I-NEXT: lbu s10, 0(a1) -; RV64I-NEXT: lbu s11, 2(a1) -; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, s10 -; RV64I-NEXT: slli s11, s11, 16 -; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: lbu s10, 5(a1) -; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or a3, s11, a3 -; RV64I-NEXT: lbu s11, 4(a1) -; RV64I-NEXT: slli s10, s10, 8 -; RV64I-NEXT: lbu ra, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s11 -; RV64I-NEXT: lbu s11, 20(a0) -; RV64I-NEXT: slli ra, ra, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, ra -; RV64I-NEXT: lbu ra, 21(a0) -; RV64I-NEXT: or a1, a1, s10 -; RV64I-NEXT: lbu s10, 22(a0) -; 
RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t2, a1, a3 -; RV64I-NEXT: lbu t0, 23(a0) -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu a6, 25(a0) -; RV64I-NEXT: lbu a5, 26(a0) -; RV64I-NEXT: lbu a1, 30(a0) -; RV64I-NEXT: lbu a3, 29(a0) -; RV64I-NEXT: lbu a4, 28(a0) -; RV64I-NEXT: lbu a0, 27(a0) -; RV64I-NEXT: sb a1, 86(sp) -; RV64I-NEXT: sb a3, 85(sp) -; RV64I-NEXT: sb a4, 84(sp) -; RV64I-NEXT: sb a0, 83(sp) -; RV64I-NEXT: sb a5, 82(sp) -; RV64I-NEXT: sb a6, 81(sp) -; RV64I-NEXT: sb a7, 80(sp) -; RV64I-NEXT: sb t0, 79(sp) -; RV64I-NEXT: sb s10, 78(sp) -; RV64I-NEXT: sb ra, 77(sp) -; RV64I-NEXT: sb s11, 76(sp) +; RV64I-NEXT: lbu s10, 20(a0) +; RV64I-NEXT: lbu s11, 21(a0) +; RV64I-NEXT: lbu ra, 22(a0) +; RV64I-NEXT: lbu a7, 23(a0) +; RV64I-NEXT: lbu a6, 24(a0) +; RV64I-NEXT: lbu a5, 25(a0) +; RV64I-NEXT: lbu a4, 26(a0) +; RV64I-NEXT: lbu a3, 27(a0) +; RV64I-NEXT: lbu a1, 28(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: ld t0, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb t0, 86(sp) +; RV64I-NEXT: ld t0, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb t0, 85(sp) +; RV64I-NEXT: sb a1, 84(sp) +; RV64I-NEXT: sb a3, 83(sp) +; RV64I-NEXT: sb a4, 82(sp) +; RV64I-NEXT: sb a5, 81(sp) +; RV64I-NEXT: sb a6, 80(sp) +; RV64I-NEXT: sb a0, 87(sp) +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: sb a7, 79(sp) +; RV64I-NEXT: sb ra, 78(sp) +; RV64I-NEXT: sb s11, 77(sp) +; RV64I-NEXT: sb s10, 76(sp) ; RV64I-NEXT: sb s9, 75(sp) ; RV64I-NEXT: sb s8, 74(sp) ; RV64I-NEXT: sb s7, 73(sp) @@ -2859,23 +2883,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb s0, 66(sp) ; RV64I-NEXT: sb t6, 65(sp) ; RV64I-NEXT: sb t5, 64(sp) -; RV64I-NEXT: sb t1, 87(sp) -; RV64I-NEXT: slli t1, t1, 56 ; RV64I-NEXT: sb t4, 63(sp) ; RV64I-NEXT: sb t3, 62(sp) -; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 61(sp) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 60(sp) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 59(sp) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 58(sp) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 57(sp) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t1, 63 +; RV64I-NEXT: sb t2, 61(sp) +; RV64I-NEXT: sb t1, 60(sp) +; RV64I-NEXT: ld a1, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 59(sp) +; RV64I-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 58(sp) +; RV64I-NEXT: ld a1, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 57(sp) +; RV64I-NEXT: ld a1, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: sb a1, 56(sp) +; RV64I-NEXT: srai a0, a0, 63 ; RV64I-NEXT: sb a0, 112(sp) ; RV64I-NEXT: sb a0, 104(sp) ; RV64I-NEXT: sb a0, 96(sp) @@ -2915,108 +2935,109 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a6, 91(sp) ; RV64I-NEXT: sb a7, 90(sp) ; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: slli a0, t2, 56 +; RV64I-NEXT: ld ra, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: slli a0, ra, 56 ; RV64I-NEXT: srli a0, a0, 59 ; RV64I-NEXT: addi a1, sp, 56 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: lbu a0, 9(a1) -; RV64I-NEXT: lbu a3, 8(a1) -; RV64I-NEXT: lbu a4, 10(a1) -; RV64I-NEXT: lbu a5, 11(a1) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a3, 13(a1) -; RV64I-NEXT: lbu a4, 12(a1) 
-; RV64I-NEXT: lbu a5, 14(a1) -; RV64I-NEXT: lbu a6, 15(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu a3, 8(a0) +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 10(a0) +; RV64I-NEXT: lbu a6, 11(a0) +; RV64I-NEXT: lbu a7, 12(a0) +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu t2, 15(a0) +; RV64I-NEXT: lbu t3, 16(a0) +; RV64I-NEXT: lbu t4, 17(a0) +; RV64I-NEXT: lbu t5, 18(a0) +; RV64I-NEXT: lbu t6, 19(a0) +; RV64I-NEXT: lbu s0, 20(a0) +; RV64I-NEXT: lbu s1, 21(a0) +; RV64I-NEXT: lbu s2, 22(a0) +; RV64I-NEXT: lbu s3, 23(a0) +; RV64I-NEXT: lbu s4, 24(a0) +; RV64I-NEXT: lbu s5, 25(a0) +; RV64I-NEXT: lbu s6, 26(a0) +; RV64I-NEXT: lbu s7, 27(a0) +; RV64I-NEXT: lbu s8, 28(a0) +; RV64I-NEXT: lbu s9, 29(a0) +; RV64I-NEXT: lbu s10, 30(a0) +; RV64I-NEXT: lbu s11, 31(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a1, a6, a5 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a3, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a4, t2, t1 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a4, a3, a0 -; RV64I-NEXT: andi a3, t2, 7 -; RV64I-NEXT: lbu a0, 17(a1) -; RV64I-NEXT: lbu a5, 16(a1) -; RV64I-NEXT: lbu a6, 18(a1) -; RV64I-NEXT: lbu a7, 19(a1) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a1) -; RV64I-NEXT: lbu a6, 20(a1) -; RV64I-NEXT: lbu a7, 22(a1) -; RV64I-NEXT: lbu t0, 23(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 -; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a4, a3, a1 +; RV64I-NEXT: andi a3, ra, 7 +; RV64I-NEXT: slli t4, t4, 8 +; RV64I-NEXT: or a1, t4, t3 +; RV64I-NEXT: slli t5, t5, 16 +; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: or a5, t6, t5 +; RV64I-NEXT: or a1, a5, a1 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a5, s3, s2 +; RV64I-NEXT: or a5, a5, s0 ; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a5, a5, a0 -; RV64I-NEXT: slli a0, a5, 1 +; RV64I-NEXT: or a5, a5, a1 +; RV64I-NEXT: slli a1, a5, 1 ; RV64I-NEXT: not a6, a3 -; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: sll a1, a1, a6 +; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t0, 2(a0) +; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: lbu t2, 4(a0) +; RV64I-NEXT: lbu t3, 5(a0) +; RV64I-NEXT: lbu t4, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a1) -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 6(a1) -; RV64I-NEXT: lbu t2, 7(a1) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 
25(a1) -; RV64I-NEXT: lbu t0, 24(a1) -; RV64I-NEXT: lbu t1, 26(a1) -; RV64I-NEXT: lbu t2, 27(a1) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a1) -; RV64I-NEXT: lbu t1, 28(a1) -; RV64I-NEXT: lbu t2, 30(a1) -; RV64I-NEXT: lbu a1, 31(a1) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or t0, t0, t1 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: slli t3, t3, 8 +; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t4 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a6, a0, a6 +; RV64I-NEXT: slli s5, s5, 8 +; RV64I-NEXT: or a0, s5, s4 +; RV64I-NEXT: slli s6, s6, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or a7, s7, s6 +; RV64I-NEXT: or a0, a7, a0 +; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or a7, s9, s8 +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: or t0, s11, s10 ; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: xori t0, a3, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a7, a1, a7 -; RV64I-NEXT: slli a1, a7, 1 -; RV64I-NEXT: sll t0, a1, t0 -; RV64I-NEXT: srl a1, a4, a3 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: or a7, a7, a0 +; RV64I-NEXT: slli a0, a7, 1 +; RV64I-NEXT: sll t0, a0, t0 +; RV64I-NEXT: srl a0, a4, a3 ; RV64I-NEXT: srl a4, a6, a3 ; RV64I-NEXT: srl a5, a5, a3 ; RV64I-NEXT: sra a3, a7, a3 @@ -3063,26 +3084,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a4, 0(a2) ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: srli a4, a1, 48 +; RV64I-NEXT: srli a4, a0, 48 ; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: srli a4, a1, 40 +; RV64I-NEXT: srli a4, a0, 40 ; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: srli a4, a1, 32 +; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: sb a4, 12(a2) -; RV64I-NEXT: srli a4, a1, 24 +; RV64I-NEXT: srli a4, a0, 24 ; RV64I-NEXT: sb a4, 11(a2) -; RV64I-NEXT: srli a4, a1, 16 +; RV64I-NEXT: srli a4, a0, 16 ; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: srli a1, a6, 56 -; RV64I-NEXT: sb a1, 23(a2) +; RV64I-NEXT: or a1, a0, a1 +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: srli a0, a6, 56 +; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: srli a3, a3, 56 ; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: sb a0, 15(a2) +; RV64I-NEXT: srli a1, a1, 56 +; RV64I-NEXT: sb a1, 15(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload @@ -3115,94 +3136,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t3, 31(a0) -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) 
# 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t2, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: lbu ra, 2(a1)
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu a4, 1(a1)
+; RV32I-NEXT: lbu t0, 29(a0)
+; RV32I-NEXT: lbu a7, 30(a0)
+; RV32I-NEXT: lbu a5, 2(a1)
; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or a3, a3, s11
-; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 22(a0)
-; RV32I-NEXT: or t1, a1, a3
-; RV32I-NEXT: lbu t0, 23(a0)
-; RV32I-NEXT: lbu a7, 24(a0)
-; RV32I-NEXT: lbu a6, 25(a0)
-; RV32I-NEXT: lbu a5, 26(a0)
-; RV32I-NEXT: lbu a1, 30(a0)
-; RV32I-NEXT: lbu a3, 29(a0)
-; RV32I-NEXT: lbu a4, 28(a0)
-; RV32I-NEXT: lbu a0, 27(a0)
-; RV32I-NEXT: sb a1, 58(sp)
-; RV32I-NEXT: sb a3, 57(sp)
-; RV32I-NEXT: sb a4, 56(sp)
-; RV32I-NEXT: sb a0, 55(sp)
-; RV32I-NEXT: sb a5, 54(sp)
-; RV32I-NEXT: sb a6, 53(sp)
-; RV32I-NEXT: sb a7, 52(sp)
-; RV32I-NEXT: sb t0, 51(sp)
-; RV32I-NEXT: sb ra, 50(sp)
-; RV32I-NEXT: sb s11, 49(sp)
-; RV32I-NEXT: sb s10, 48(sp)
-; RV32I-NEXT: sb s9, 47(sp)
-; RV32I-NEXT: sb s8, 46(sp)
-; RV32I-NEXT: sb s7, 45(sp)
-; RV32I-NEXT: sb s6, 44(sp)
-; RV32I-NEXT: sb s5, 43(sp)
-; RV32I-NEXT: sb t3, 59(sp)
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: sb s4, 42(sp)
-; RV32I-NEXT: sb s3, 41(sp)
-; RV32I-NEXT: sb s2, 40(sp)
-; RV32I-NEXT: sb s1, 39(sp)
-; RV32I-NEXT: sb s0, 38(sp)
-; RV32I-NEXT: sb t6, 37(sp)
-; RV32I-NEXT: sb t5, 36(sp)
-; RV32I-NEXT: sb t4, 35(sp)
+; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a1, 0(a0)
+; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a1, 1(a0)
+; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a1, 2(a0)
+; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a1, 3(a0)
+; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a1, 4(a0)
+; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu t1, 5(a0)
+; RV32I-NEXT: lbu t2, 6(a0)
+; RV32I-NEXT: lbu t3, 7(a0)
+; RV32I-NEXT: lbu t4, 8(a0)
+; RV32I-NEXT: lbu t5, 9(a0)
+; RV32I-NEXT: lbu t6, 10(a0)
+; RV32I-NEXT: lbu s0, 11(a0)
+; RV32I-NEXT: lbu s1, 12(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s3, 14(a0)
+; RV32I-NEXT: lbu s4, 15(a0)
+; RV32I-NEXT: lbu s5, 16(a0)
+; RV32I-NEXT: lbu s6, 17(a0)
+; RV32I-NEXT: lbu s7, 18(a0)
+; RV32I-NEXT: lbu s8, 19(a0)
+; RV32I-NEXT: lbu s9, 20(a0)
+; RV32I-NEXT: lbu s10, 21(a0)
+; RV32I-NEXT: lbu s11, 22(a0)
+; RV32I-NEXT: lbu ra, 23(a0)
+; RV32I-NEXT: lbu a6, 24(a0)
+; RV32I-NEXT: lbu a5, 25(a0)
+; RV32I-NEXT: lbu a4, 26(a0)
+; RV32I-NEXT: lbu a3, 27(a0)
+; RV32I-NEXT: lbu a1, 28(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: sb a7, 58(sp)
+; RV32I-NEXT: sb t0, 57(sp)
+; RV32I-NEXT: sb a1, 56(sp)
+; RV32I-NEXT: sb a3, 55(sp)
+; RV32I-NEXT: sb a4, 54(sp)
+; RV32I-NEXT: sb a5, 53(sp)
+; RV32I-NEXT: sb a0, 59(sp)
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: sb a6, 52(sp)
+; RV32I-NEXT: sb ra, 51(sp)
+; RV32I-NEXT: sb s11, 50(sp)
+; RV32I-NEXT: sb s10, 49(sp)
+; RV32I-NEXT: sb s9, 48(sp)
+; RV32I-NEXT: sb s8, 47(sp)
+; RV32I-NEXT: sb s7, 46(sp)
+; RV32I-NEXT: sb s6, 45(sp)
+; RV32I-NEXT: sb s5, 44(sp)
+; RV32I-NEXT: sb s4, 43(sp)
+; RV32I-NEXT: sb s3, 42(sp)
+; RV32I-NEXT: sb s2, 41(sp)
+; RV32I-NEXT: sb s1, 40(sp)
+; RV32I-NEXT: sb s0, 39(sp)
+; RV32I-NEXT: sb t6, 38(sp)
+; RV32I-NEXT: sb t5, 37(sp)
+; RV32I-NEXT: sb t4, 36(sp)
+; RV32I-NEXT: sb t3, 35(sp)
; RV32I-NEXT: sb t2, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: srai a0, t3, 31
+; RV32I-NEXT: sb t1, 33(sp)
+; RV32I-NEXT: lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sb a1, 32(sp)
+; RV32I-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sb a1, 31(sp)
+; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sb a1, 30(sp)
+; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sb a1, 29(sp)
+; RV32I-NEXT: lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sb a1, 28(sp)
+; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: sb a0, 88(sp)
; RV32I-NEXT: sb a0, 84(sp)
; RV32I-NEXT: sb a0, 80(sp)
@@ -3238,175 +3258,181 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sb a1, 63(sp)
; RV32I-NEXT: sb a3, 62(sp)
; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: slli a0, t1, 24
+; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 28
-; RV32I-NEXT: add a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a3, t1, 7
-; RV32I-NEXT: lbu a0, 9(a4)
-; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
-; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: addi a1, sp, 28
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: lbu a1, 4(a0)
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a7, 6(a0)
+; RV32I-NEXT: lbu a3, 7(a0)
+; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 8(a0)
+; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu t0, 9(a0)
+; RV32I-NEXT: lbu t1, 10(a0)
+; RV32I-NEXT: lbu t2, 11(a0)
+; RV32I-NEXT: lbu t3, 12(a0)
+; RV32I-NEXT: lbu t4, 13(a0)
+; RV32I-NEXT: lbu t5, 14(a0)
+; RV32I-NEXT: lbu t6, 15(a0)
+; RV32I-NEXT: lbu s0, 16(a0)
+; RV32I-NEXT: lbu s1, 17(a0)
+; RV32I-NEXT: lbu s2, 18(a0)
+; RV32I-NEXT: lbu s3, 19(a0)
+; RV32I-NEXT: lbu s4, 20(a0)
+; RV32I-NEXT: lbu s6, 21(a0)
+; RV32I-NEXT: lbu s7, 22(a0)
+; RV32I-NEXT: lbu s8, 23(a0)
+; RV32I-NEXT: lbu s5, 24(a0)
+; RV32I-NEXT: lbu s9, 25(a0)
+; RV32I-NEXT: lbu s10, 26(a0)
+; RV32I-NEXT: lbu s11, 27(a0)
+; RV32I-NEXT: lbu ra, 28(a0)
+; RV32I-NEXT: lbu a5, 29(a0)
+; RV32I-NEXT: lbu a6, 30(a0)
+; RV32I-NEXT: lbu a3, 31(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a1
+; RV32I-NEXT: slli a1, a7, 16
+; RV32I-NEXT: lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: or a4, a1, a4
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a1, t0, a1
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: lbu t1, 0(a0)
+; RV32I-NEXT: lbu t2, 1(a0)
+; RV32I-NEXT: or t0, a7, a1
+; RV32I-NEXT: lbu a1, 2(a0)
+; RV32I-NEXT: lbu a0, 3(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
-; RV32I-NEXT: or a6, a1, a0
-; RV32I-NEXT: slli a0, a6, 1
-; RV32I-NEXT: not t1, a3
-; RV32I-NEXT: sll a0, a0, t1
-; RV32I-NEXT: lbu a1, 1(a4)
-; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a5
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or t0, a5, a1
-; RV32I-NEXT: slli a1, t5, 1
-; RV32I-NEXT: xori t2, a3, 31
-; RV32I-NEXT: sll a1, a1, t2
-; RV32I-NEXT: lbu a5, 13(a4)
-; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t3, a7, a5
-; RV32I-NEXT: lbu a5, 17(a4)
-; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: or a1, a0, a7
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: or a0, t4, t3
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
-; RV32I-NEXT: or t4, a7, a5
-; RV32I-NEXT: slli a5, t4, 1
-; RV32I-NEXT: sll a7, a5, t1
-; RV32I-NEXT: lbu a5, 21(a4)
-; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
+; RV32I-NEXT: or a7, t6, t5
+; RV32I-NEXT: or t2, a7, a0
+; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a5
-; RV32I-NEXT: lbu a5, 25(a4)
-; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: or t6, t6, a5
-; RV32I-NEXT: lbu a5, 29(a4)
-; RV32I-NEXT: lbu s1, 28(a4)
-; RV32I-NEXT: slli s2, t6, 1
-; RV32I-NEXT: sll t1, s2, t1
+; RV32I-NEXT: slli s2, s2, 16
+; RV32I-NEXT: slli s3, s3, 24
+; RV32I-NEXT: or a0, s3, s2
+; RV32I-NEXT: or s0, a0, s0
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: or a0, s6, s4
+; RV32I-NEXT: lw a7, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: andi t3, a7, 7
+; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli s8, s8, 24
+; RV32I-NEXT: or a7, s8, s7
+; RV32I-NEXT: slli t1, t0, 1
+; RV32I-NEXT: or t4, a7, a0
+; RV32I-NEXT: not a7, t3
+; RV32I-NEXT: sll a0, t1, a7
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: or t5, s9, s5
+; RV32I-NEXT: slli t6, a4, 1
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: or s1, s11, s10
+; RV32I-NEXT: slli t1, s0, 1
+; RV32I-NEXT: sll t1, t1, a7
+; RV32I-NEXT: or t5, s1, t5
+; RV32I-NEXT: slli s1, t5, 1
+; RV32I-NEXT: sll s1, s1, a7
+; RV32I-NEXT: xori s2, t3, 31
+; RV32I-NEXT: sll a7, t6, s2
; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
-; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: slli s1, s0, 1
-; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: or s3, a4, a5
-; RV32I-NEXT: slli a4, s3, 1
-; RV32I-NEXT: sll t2, a4, t2
-; RV32I-NEXT: srl a4, t5, a3
-; RV32I-NEXT: srl a5, t0, a3
-; RV32I-NEXT: srl t0, t3, a3
-; RV32I-NEXT: srl a6, a6, a3
-; RV32I-NEXT: srl t3, s0, a3
-; RV32I-NEXT: srl t4, t4, a3
-; RV32I-NEXT: srl t5, t6, a3
-; RV32I-NEXT: sra a3, s3, a3
-; RV32I-NEXT: srli t6, t5, 16
-; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: or t2, t5, t2
+; RV32I-NEXT: or a5, a5, ra
+; RV32I-NEXT: slli t6, t2, 1
+; RV32I-NEXT: sll t6, t6, s2
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: slli a6, t4, 1
+; RV32I-NEXT: sll a6, a6, s2
+; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: slli a5, a3, 1
+; RV32I-NEXT: sll a5, a5, s2
+; RV32I-NEXT: srl a4, a4, t3
+; RV32I-NEXT: srl a1, a1, t3
+; RV32I-NEXT: srl t2, t2, t3
+; RV32I-NEXT: srl t0, t0, t3
+; RV32I-NEXT: srl t4, t4, t3
+; RV32I-NEXT: srl s0, s0, t3
+; RV32I-NEXT: srl t5, t5, t3
+; RV32I-NEXT: sra a3, a3, t3
+; RV32I-NEXT: srli t3, t5, 16
+; RV32I-NEXT: sb t3, 26(a2)
+; RV32I-NEXT: or a5, t5, a5
; RV32I-NEXT: sb t5, 24(a2)
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: sb t5, 25(a2)
-; RV32I-NEXT: srli t5, a3, 24
-; RV32I-NEXT: sb t5, 31(a2)
-; RV32I-NEXT: srli t5, a3, 16
-; RV32I-NEXT: sb t5, 30(a2)
+; RV32I-NEXT: srli t3, t5, 8
+; RV32I-NEXT: sb t3, 25(a2)
+; RV32I-NEXT: srli t3, a3, 24
+; RV32I-NEXT: sb t3, 31(a2)
+; RV32I-NEXT: srli t3, a3, 16
+; RV32I-NEXT: sb t3, 30(a2)
; RV32I-NEXT: sb a3, 28(a2)
; RV32I-NEXT: srli a3, a3, 8
; RV32I-NEXT: sb a3, 29(a2)
-; RV32I-NEXT: srli a3, t4, 16
+; RV32I-NEXT: srli a3, s0, 16
; RV32I-NEXT: sb a3, 18(a2)
-; RV32I-NEXT: or a3, t4, s1
-; RV32I-NEXT: sb t4, 16(a2)
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb t4, 17(a2)
-; RV32I-NEXT: srli t4, t3, 16
-; RV32I-NEXT: sb t4, 22(a2)
-; RV32I-NEXT: or t1, t3, t1
-; RV32I-NEXT: sb t3, 20(a2)
-; RV32I-NEXT: srli t3, t3, 8
+; RV32I-NEXT: or a3, s0, a6
+; RV32I-NEXT: sb s0, 16(a2)
+; RV32I-NEXT: srli s0, s0, 8
+; RV32I-NEXT: sb s0, 17(a2)
+; RV32I-NEXT: srli a6, t4, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: or a6, t4, s1
+; RV32I-NEXT: sb t4, 20(a2)
+; RV32I-NEXT: srli t3, t4, 8
; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, a6, 16
+; RV32I-NEXT: srli t3, t0, 16
; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, a6, s2
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: srli a6, a6, 8
-; RV32I-NEXT: sb a6, 9(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: sb a6, 14(a2)
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: srli a7, t0, 8
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: srli a7, a5, 16
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: sb a5, 0(a2)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 1(a2)
-; RV32I-NEXT: srli a5, a4, 16
-; RV32I-NEXT: sb a5, 6(a2)
+; RV32I-NEXT: or t3, t0, t6
+; RV32I-NEXT: sb t0, 8(a2)
+; RV32I-NEXT: srli t0, t0, 8
+; RV32I-NEXT: sb t0, 9(a2)
+; RV32I-NEXT: srli t0, t2, 16
+; RV32I-NEXT: sb t0, 14(a2)
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: sb t2, 12(a2)
+; RV32I-NEXT: srli t1, t2, 8
+; RV32I-NEXT: sb t1, 13(a2)
+; RV32I-NEXT: srli t1, a1, 16
+; RV32I-NEXT: sb t1, 2(a2)
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a4, 16
+; RV32I-NEXT: sb a1, 6(a2)
; RV32I-NEXT: or a0, a4, a0
; RV32I-NEXT: sb a4, 4(a2)
; RV32I-NEXT: srli a4, a4, 8
; RV32I-NEXT: sb a4, 5(a2)
-; RV32I-NEXT: srli a4, t2, 24
-; RV32I-NEXT: sb a4, 27(a2)
+; RV32I-NEXT: srli a5, a5, 24
+; RV32I-NEXT: sb a5, 27(a2)
; RV32I-NEXT: srli a3, a3, 24
; RV32I-NEXT: sb a3, 19(a2)
-; RV32I-NEXT: srli a3, t1, 24
-; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: srli a3, t3, 24
-; RV32I-NEXT: sb a3, 11(a2)
-; RV32I-NEXT: srli a3, a6, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a1, a1, 24
+; RV32I-NEXT: srli a1, a6, 24
+; RV32I-NEXT: sb a1, 23(a2)
+; RV32I-NEXT: srli a1, t3, 24
+; RV32I-NEXT: sb a1, 11(a2)
+; RV32I-NEXT: srli a1, t0, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a1, a7, 24
; RV32I-NEXT: sb a1, 3(a2)
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: sb a0, 7(a2)
diff --git a/llvm/test/CodeGen/RISCV/xtheadmempair.ll b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
index 34900b3006915..b51903765d114 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmempair.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
@@ -57,14 +57,14 @@ define i64 @lwud(i32* %a) {
define i64 @ldd(i64* %a) {
; RV32XTHEADMEMPAIR-LABEL: ldd:
; RV32XTHEADMEMPAIR: # %bb.0:
-; RV32XTHEADMEMPAIR-NEXT: lw a1, 32(a0)
-; RV32XTHEADMEMPAIR-NEXT: lw a2, 36(a0)
-; RV32XTHEADMEMPAIR-NEXT: lw a3, 44(a0)
-; RV32XTHEADMEMPAIR-NEXT: lw a0, 40(a0)
-; RV32XTHEADMEMPAIR-NEXT: add a2, a2, a3
-; RV32XTHEADMEMPAIR-NEXT: add a0, a1, a0
-; RV32XTHEADMEMPAIR-NEXT: sltu a1, a0, a1
-; RV32XTHEADMEMPAIR-NEXT: add a1, a2, a1
+; RV32XTHEADMEMPAIR-NEXT: lw a1, 40(a0)
+; RV32XTHEADMEMPAIR-NEXT: lw a2, 32(a0)
+; RV32XTHEADMEMPAIR-NEXT: lw a3, 36(a0)
+; RV32XTHEADMEMPAIR-NEXT: lw a4, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT: add a0, a2, a1
+; RV32XTHEADMEMPAIR-NEXT: sltu a1, a0, a2
+; RV32XTHEADMEMPAIR-NEXT: add a3, a3, a4
+; RV32XTHEADMEMPAIR-NEXT: add a1, a3, a1
; RV32XTHEADMEMPAIR-NEXT: ret
;
; RV64XTHEADMEMPAIR-LABEL: ldd: