diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index fe4a3d7c551bd..486508e760dc5 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -304,10 +304,6 @@ class LLVM_ABI ScheduleDAGMI : public ScheduleDAGInstrs { /// The bottom of the unscheduled zone. MachineBasicBlock::iterator CurrentBottom; - /// Record the next node in a scheduled cluster. - const SUnit *NextClusterPred = nullptr; - const SUnit *NextClusterSucc = nullptr; - #if LLVM_ENABLE_ABI_BREAKING_CHECKS /// The number of instructions scheduled so far. Used to cut off the /// scheduler at the point determined by misched-cutoff. @@ -368,10 +364,6 @@ class LLVM_ABI ScheduleDAGMI : public ScheduleDAGInstrs { /// live ranges and region boundary iterators. void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos); - const SUnit *getNextClusterPred() const { return NextClusterPred; } - - const SUnit *getNextClusterSucc() const { return NextClusterSucc; } - void viewGraph(const Twine &Name, const Twine &Title) override; void viewGraph() override; @@ -1295,6 +1287,9 @@ class LLVM_ABI GenericScheduler : public GenericSchedulerBase { SchedBoundary Top; SchedBoundary Bot; + ClusterInfo *TopCluster; + ClusterInfo *BotCluster; + /// Candidate last picked from Top boundary. SchedCandidate TopCand; /// Candidate last picked from Bot boundary. @@ -1335,6 +1330,9 @@ class LLVM_ABI PostGenericScheduler : public GenericSchedulerBase { /// Candidate last picked from Bot boundary. SchedCandidate BotCand; + ClusterInfo *TopCluster; + ClusterInfo *BotCluster; + public: PostGenericScheduler(const MachineSchedContext *C) : GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"), diff --git a/llvm/include/llvm/CodeGen/ScheduleDAG.h b/llvm/include/llvm/CodeGen/ScheduleDAG.h index 7a49e9ffedf8a..3a0a31b1930b0 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAG.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAG.h @@ -17,6 +17,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/CodeGen/MachineInstr.h" @@ -235,6 +236,10 @@ class TargetRegisterInfo; LLVM_ABI void dump(const TargetRegisterInfo *TRI = nullptr) const; }; + /// Keep record of which SUnit are in the same cluster group. + typedef SmallSet ClusterInfo; + constexpr unsigned InvalidClusterId = ~0u; + /// Scheduling unit. This is a node in the scheduling DAG. class SUnit { private: @@ -275,6 +280,8 @@ class TargetRegisterInfo; unsigned TopReadyCycle = 0; ///< Cycle relative to start when node is ready. unsigned BotReadyCycle = 0; ///< Cycle relative to end when node is ready. + unsigned ParentClusterIdx = InvalidClusterId; ///< The parent cluster id. + private: unsigned Depth = 0; ///< Node depth. unsigned Height = 0; ///< Node height. diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index 7f7b3036af803..ba5b2da64fd80 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -181,6 +181,8 @@ namespace llvm { /// case of a huge region that gets reduced). SUnit *BarrierChain = nullptr; + SmallVector Clusters; + public: /// A list of SUnits, used in Value2SUsMap, during DAG construction. /// Note: to gain speed it might be worth investigating an optimized @@ -384,6 +386,14 @@ namespace llvm { /// equivalent edge already existed (false indicates failure). bool addEdge(SUnit *SuccSU, const SDep &PredDep); + /// Returns the array of the clusters. + SmallVector &getClusters() { return Clusters; } + + /// Get the specific cluster, return nullptr for InvalidClusterId. + ClusterInfo *getCluster(unsigned Idx) { + return Idx != InvalidClusterId ? &Clusters[Idx] : nullptr; + } + protected: void initSUnits(); void addPhysRegDataDeps(SUnit *SU, unsigned OperIdx); diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 2cc7f549a14d1..7b4f29e82fe6f 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -943,8 +944,6 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { if (SuccEdge->isWeak()) { --SuccSU->WeakPredsLeft; - if (SuccEdge->isCluster()) - NextClusterSucc = SuccSU; return; } #ifndef NDEBUG @@ -967,12 +966,6 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { /// releaseSuccessors - Call releaseSucc on each of SU's successors. void ScheduleDAGMI::releaseSuccessors(SUnit *SU) { - // Reset the next successor, For example, we want to cluster A B C. - // After A is picked, we will set B as next cluster succ, but if we pick - // D instead of B after A, then we need to reset the next cluster succ because - // we have decided to not pick the cluster candidate B during pickNode(). - // Leaving B as the NextClusterSucc just make things messy. - NextClusterSucc = nullptr; for (SDep &Succ : SU->Succs) releaseSucc(SU, &Succ); } @@ -986,8 +979,6 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { if (PredEdge->isWeak()) { --PredSU->WeakSuccsLeft; - if (PredEdge->isCluster()) - NextClusterPred = PredSU; return; } #ifndef NDEBUG @@ -1010,7 +1001,6 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { /// releasePredecessors - Call releasePred on each of SU's predecessors. void ScheduleDAGMI::releasePredecessors(SUnit *SU) { - NextClusterPred = nullptr; for (SDep &Pred : SU->Preds) releasePred(SU, &Pred); } @@ -1184,11 +1174,8 @@ findRootsAndBiasEdges(SmallVectorImpl &TopRoots, } /// Identify DAG roots and setup scheduler queues. -void ScheduleDAGMI::initQueues(ArrayRef TopRoots, - ArrayRef BotRoots) { - NextClusterSucc = nullptr; - NextClusterPred = nullptr; - +void ScheduleDAGMI::initQueues(ArrayRef TopRoots, + ArrayRef BotRoots) { // Release all DAG roots for scheduling, not including EntrySU/ExitSU. // // Nodes with unreleased weak edges can still be roots. @@ -2116,6 +2103,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( ScheduleDAGInstrs *DAG) { // Keep track of the current cluster length and bytes for each SUnit. DenseMap> SUnit2ClusterInfo; + EquivalenceClasses Clusters; // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to // cluster mem ops collected within `MemOpRecords` array. @@ -2155,6 +2143,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( SUnit *SUa = MemOpa.SU; SUnit *SUb = MemOpb.SU; + if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum) std::swap(SUa, SUb); @@ -2162,6 +2151,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) continue; + Clusters.unionSets(SUa, SUb); LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU(" << SUb->NodeNum << ")\n"); ++NumClustered; @@ -2201,6 +2191,21 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( << ", Curr cluster bytes: " << CurrentClusterBytes << "\n"); } + + // Add cluster group information. + // Iterate over all of the equivalence sets. + auto &AllClusters = DAG->getClusters(); + for (const EquivalenceClasses::ECValue *I : Clusters) { + if (!I->isLeader()) + continue; + ClusterInfo Group; + unsigned ClusterIdx = AllClusters.size(); + for (SUnit *MemberI : Clusters.members(*I)) { + MemberI->ParentClusterIdx = ClusterIdx; + Group.insert(MemberI); + } + AllClusters.push_back(Group); + } } void BaseMemOpClusterMutation::collectMemOpRecords( @@ -3688,6 +3693,9 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) { } TopCand.SU = nullptr; BotCand.SU = nullptr; + + TopCluster = nullptr; + BotCluster = nullptr; } /// Initialize the per-region scheduling policy. @@ -3997,13 +4005,11 @@ bool GenericScheduler::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const SUnit *CandNextClusterSU = - Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - const SUnit *TryCandNextClusterSU = - TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == TryCandNextClusterSU, - Cand.SU == CandNextClusterSU, - TryCand, Cand, Cluster)) + const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; + const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; + if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), + CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + Cluster)) return TryCand.Reason != NoCand; if (SameBoundary) { @@ -4262,11 +4268,25 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) { void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { if (IsTopNode) { SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); + TopCluster = DAG->getCluster(SU->ParentClusterIdx); + LLVM_DEBUG(if (TopCluster) { + dbgs() << " Top Cluster: "; + for (auto *N : *TopCluster) + dbgs() << N->NodeNum << '\t'; + dbgs() << '\n'; + }); Top.bumpNode(SU); if (SU->hasPhysRegUses) reschedulePhysReg(SU, true); } else { SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); + BotCluster = DAG->getCluster(SU->ParentClusterIdx); + LLVM_DEBUG(if (BotCluster) { + dbgs() << " Bot Cluster: "; + for (auto *N : *BotCluster) + dbgs() << N->NodeNum << '\t'; + dbgs() << '\n'; + }); Bot.bumpNode(SU); if (SU->hasPhysRegDefs) reschedulePhysReg(SU, false); @@ -4303,6 +4323,8 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { if (!Bot.HazardRec) { Bot.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG); } + TopCluster = nullptr; + BotCluster = nullptr; } void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, @@ -4367,14 +4389,12 @@ bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand, return TryCand.Reason != NoCand; // Keep clustered nodes together. - const SUnit *CandNextClusterSU = - Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - const SUnit *TryCandNextClusterSU = - TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == TryCandNextClusterSU, - Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) + const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; + const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; + if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), + CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + Cluster)) return TryCand.Reason != NoCand; - // Avoid critical resource consumption and balance the schedule. if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, TryCand, Cand, ResourceReduce)) @@ -4571,9 +4591,11 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { if (IsTopNode) { SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); + TopCluster = DAG->getCluster(SU->ParentClusterIdx); Top.bumpNode(SU); } else { SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); + BotCluster = DAG->getCluster(SU->ParentClusterIdx); Bot.bumpNode(SU); } } diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp index 5bd6ca0978a4b..71053342f468e 100644 --- a/llvm/lib/CodeGen/MacroFusion.cpp +++ b/llvm/lib/CodeGen/MacroFusion.cpp @@ -61,6 +61,11 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU, for (SDep &SI : SecondSU.Preds) if (SI.isCluster()) return false; + + unsigned FirstCluster = FirstSU.ParentClusterIdx; + unsigned SecondCluster = SecondSU.ParentClusterIdx; + assert(FirstCluster == InvalidClusterId && SecondCluster == InvalidClusterId); + // Though the reachability checks above could be made more generic, // perhaps as part of ScheduleDAGInstrs::addEdge(), since such edges are valid, // the extra computation cost makes it less interesting in general cases. @@ -70,6 +75,14 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU, if (!DAG.addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster))) return false; + auto &Clusters = DAG.getClusters(); + + FirstSU.ParentClusterIdx = Clusters.size(); + SecondSU.ParentClusterIdx = Clusters.size(); + + SmallSet Cluster{{&FirstSU, &SecondSU}}; + Clusters.push_back(Cluster); + // TODO - If we want to chain more than two instructions, we need to create // artifical edges to make dependencies from the FirstSU also dependent // on other chained instructions, and other chained instructions also diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp index 26857edd871e2..e630b80e33ab4 100644 --- a/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -365,6 +365,9 @@ LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeName(const SUnit &SU) const { LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeAll(const SUnit &SU) const { dumpNode(SU); SU.dumpAttributes(); + if (SU.ParentClusterIdx != InvalidClusterId) + dbgs() << " Parent Cluster Index: " << SU.ParentClusterIdx << '\n'; + if (SU.Preds.size() > 0) { dbgs() << " Predecessors:\n"; for (const SDep &Dep : SU.Preds) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 706ae92c9e47c..0f80462050cda 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -589,12 +589,11 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const SUnit *CandNextClusterSU = - Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - const SUnit *TryCandNextClusterSU = - TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == TryCandNextClusterSU, - Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) + const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; + const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; + if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), + CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + Cluster)) return TryCand.Reason != NoCand; // Avoid increasing the max critical pressure in the scheduled region. @@ -664,12 +663,11 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, // MaxMemoryClause-specific: We prioritize clustered instructions as we would // get more benefit from clausing these memory instructions. - const SUnit *CandNextClusterSU = - Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - const SUnit *TryCandNextClusterSU = - TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == TryCandNextClusterSU, - Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) + const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; + const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; + if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), + CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + Cluster)) return TryCand.Reason != NoCand; // We only compare a subset of features when comparing nodes between diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp index 03712879f7c49..5eb1f0128643d 100644 --- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp +++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp @@ -100,12 +100,11 @@ bool PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const SUnit *CandNextClusterSU = - Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - const SUnit *TryCandNextClusterSU = - TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == TryCandNextClusterSU, - Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) + const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; + const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; + if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), + CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + Cluster)) return TryCand.Reason != NoCand; if (SameBoundary) { @@ -190,8 +189,11 @@ bool PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, return TryCand.Reason != NoCand; // Keep clustered nodes together. - if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), - Cand.SU == DAG->getNextClusterSucc(), TryCand, Cand, Cluster)) + const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; + const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; + if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), + CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + Cluster)) return TryCand.Reason != NoCand; // Avoid critical resource consumption and balance the schedule. diff --git a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll index f364429b86c38..6019a62f4925e 100644 --- a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll +++ b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll @@ -581,9 +581,8 @@ define void @callee_in_memory(%T_IN_MEMORY %a) { ; CHECK-SD-NEXT: add x8, x8, :lo12:in_memory_store ; CHECK-SD-NEXT: ldr d0, [sp, #64] ; CHECK-SD-NEXT: str d0, [x8, #64] -; CHECK-SD-NEXT: ldr q0, [sp, #16] ; CHECK-SD-NEXT: str q2, [x8, #48] -; CHECK-SD-NEXT: ldr q2, [sp] +; CHECK-SD-NEXT: ldp q2, q0, [sp] ; CHECK-SD-NEXT: stp q0, q1, [x8, #16] ; CHECK-SD-NEXT: str q2, [x8] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll index 7e72e8de01f4f..3bada9d5b3bb4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll @@ -7,8 +7,8 @@ ; CHECK-LABEL: @test ; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 -; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], [[[BASE]]] -; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], [[[BASE]], #64] +; CHECK-DAG: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], [[[BASE]]] +; CHECK-DAG: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], [[[BASE]], #64] ; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] ; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] ; CHECK: ret @@ -36,8 +36,8 @@ entry: ; CHECK-LABEL: @test_int ; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 -; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], [[[BASE]]] -; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], [[[BASE]], #64] +; CHECK-DAG: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], [[[BASE]]] +; CHECK-DAG: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], [[[BASE]], #64] ; CHECK: add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] ; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] ; CHECK: ret @@ -65,8 +65,8 @@ entry: ; CHECK-LABEL: @test_long ; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4 -; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], [[[BASE]]] -; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], [[[BASE]], #128] +; CHECK-DAG: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], [[[BASE]]] +; CHECK-DAG: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], [[[BASE]], #128] ; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] ; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] ; CHECK: ret diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll index fee52ead98962..e70ddc3415cac 100644 --- a/llvm/test/CodeGen/AArch64/bcmp.ll +++ b/llvm/test/CodeGen/AArch64/bcmp.ll @@ -494,13 +494,14 @@ define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2) ; CHECK-LABEL: bcmp_i128: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp x2, x0 -; CHECK-NEXT: ldp x8, x10, [sp] +; CHECK-NEXT: ldp x10, x8, [sp, #8] ; CHECK-NEXT: ccmp x3, x1, #0, eq -; CHECK-NEXT: ldp x9, x11, [sp, #16] +; CHECK-NEXT: ldr x9, [sp] +; CHECK-NEXT: ldr x11, [sp, #24] ; CHECK-NEXT: ccmp x6, x4, #0, eq ; CHECK-NEXT: ccmp x7, x5, #0, eq ; CHECK-NEXT: cset w12, ne -; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: ccmp x11, x10, #0, eq ; CHECK-NEXT: csinc w0, w12, wzr, eq ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index 75f3ffc9515e5..cabb0e7278e40 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -861,7 +861,7 @@ define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_ext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s3, [x2] +; CHECK-NEXT: ldp s0, s5, [x2] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: add x9, x3, #12 ; CHECK-NEXT: add x10, x1, #8 @@ -871,26 +871,26 @@ define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v2.s }[1], [x1] ; CHECK-NEXT: ldp s7, s4, [x0, #8] -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ldp s6, s5, [x2, #8] +; CHECK-NEXT: ld1 { v5.s }[1], [x3] +; CHECK-NEXT: ldp s6, s3, [x2, #8] ; CHECK-NEXT: ld1 { v4.s }[1], [x11] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] ; CHECK-NEXT: uaddl v2.8h, v2.8b, v4.8b ; CHECK-NEXT: uaddl v1.8h, v1.8b, v7.8b ; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: uaddl v3.8h, v3.8b, v5.8b +; CHECK-NEXT: uaddl v5.8h, v5.8b, v3.8b ; CHECK-NEXT: uaddl v6.8h, v0.8b, v6.8b -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: ushll v16.8h, v3.8b, #0 ; CHECK-NEXT: ushll v0.4s, v2.4h, #3 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 -; CHECK-NEXT: stp q4, q5, [x4] +; CHECK-NEXT: ushll v7.4s, v5.4h, #3 +; CHECK-NEXT: ushll2 v5.4s, v5.8h, #3 +; CHECK-NEXT: stp q4, q16, [x4] ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v6.8h +; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v6.8h ; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p @@ -960,7 +960,7 @@ define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_add: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s4, [x2] +; CHECK-NEXT: ldp s0, s5, [x2] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: add x9, x3, #12 ; CHECK-NEXT: add x10, x1, #8 @@ -970,15 +970,15 @@ define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v2.s }[1], [x1] ; CHECK-NEXT: ldp s7, s3, [x0, #8] -; CHECK-NEXT: ld1 { v4.s }[1], [x3] -; CHECK-NEXT: ldp s6, s5, [x2, #8] +; CHECK-NEXT: ld1 { v5.s }[1], [x3] +; CHECK-NEXT: ldp s6, s4, [x2, #8] ; CHECK-NEXT: ld1 { v3.s }[1], [x11] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] ; CHECK-NEXT: uaddl v16.8h, v2.8b, v3.8b ; CHECK-NEXT: uaddl v1.8h, v1.8b, v7.8b -; CHECK-NEXT: uaddl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b ; CHECK-NEXT: uaddl v2.8h, v0.8b, v6.8b ; CHECK-NEXT: ushll v0.4s, v16.4h, #3 ; CHECK-NEXT: ushll2 v6.4s, v16.8h, #3 diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll index d9d80f1cb50ee..1fbca7ca2c27c 100644 --- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll @@ -118,12 +118,12 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop(ptr %A, ptr %B, ptr %dst) { ; CHECK-NEXT: add x10, x0, x9 ; CHECK-NEXT: add x9, x1, x9 ; CHECK-NEXT: ldp q2, q1, [x10] -; CHECK-NEXT: fcvtzu.4s v5, v1 -; CHECK-NEXT: ldp q1, q3, [x9] -; CHECK-NEXT: fcvtzu.4s v4, v2 -; CHECK-NEXT: fcvtzu.4s v7, v3 +; CHECK-NEXT: fcvtzu.4s v4, v1 +; CHECK-NEXT: ldp q7, q1, [x9] +; CHECK-NEXT: fcvtzu.4s v3, v2 ; CHECK-NEXT: fcvtzu.4s v6, v1 -; CHECK-NEXT: tbl.16b v1, { v4, v5, v6, v7 }, v0 +; CHECK-NEXT: fcvtzu.4s v5, v7 +; CHECK-NEXT: tbl.16b v1, { v3, v4, v5, v6 }, v0 ; CHECK-NEXT: str q1, [x2, x8, lsl #4] ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: cmp x8, #1000 @@ -185,12 +185,12 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle(ptr %A, ptr %B, p ; CHECK-NEXT: add x10, x0, x9 ; CHECK-NEXT: add x9, x1, x9 ; CHECK-NEXT: ldp q2, q1, [x10] -; CHECK-NEXT: fcvtzu.4s v5, v1 -; CHECK-NEXT: ldp q1, q3, [x9] -; CHECK-NEXT: fcvtzu.4s v4, v2 -; CHECK-NEXT: fcvtzu.4s v7, v3 +; CHECK-NEXT: fcvtzu.4s v4, v1 +; CHECK-NEXT: ldp q7, q1, [x9] +; CHECK-NEXT: fcvtzu.4s v3, v2 ; CHECK-NEXT: fcvtzu.4s v6, v1 -; CHECK-NEXT: tbl.16b v1, { v4, v5, v6, v7 }, v0 +; CHECK-NEXT: fcvtzu.4s v5, v7 +; CHECK-NEXT: tbl.16b v1, { v3, v4, v5, v6 }, v0 ; CHECK-NEXT: str q1, [x2, x8, lsl #4] ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: cmp x8, #1000 @@ -252,12 +252,12 @@ define void @fptoui_v16f32_to_v16i8_in_loop(ptr %A, ptr %dst) { ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: cmp x8, #1000 ; CHECK-NEXT: ldp q2, q1, [x9, #32] -; CHECK-NEXT: fcvtzu.4s v7, v1 -; CHECK-NEXT: ldp q1, q3, [x9] -; CHECK-NEXT: fcvtzu.4s v6, v2 -; CHECK-NEXT: fcvtzu.4s v5, v3 +; CHECK-NEXT: fcvtzu.4s v6, v1 +; CHECK-NEXT: ldp q7, q1, [x9] +; CHECK-NEXT: fcvtzu.4s v5, v2 ; CHECK-NEXT: fcvtzu.4s v4, v1 -; CHECK-NEXT: tbl.16b v1, { v4, v5, v6, v7 }, v0 +; CHECK-NEXT: fcvtzu.4s v3, v7 +; CHECK-NEXT: tbl.16b v1, { v3, v4, v5, v6 }, v0 ; CHECK-NEXT: str q1, [x1], #32 ; CHECK-NEXT: b.eq LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %exit diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 8d9a6e6b92914..39ce70f88d6ba 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -576,17 +576,17 @@ define <3 x i128> @v3i128(<3 x i128> %d, <3 x i128> %e) { ; CHECK-GI-NEXT: mul x9, x2, x10 ; CHECK-GI-NEXT: umulh x14, x2, x10 ; CHECK-GI-NEXT: madd x10, x3, x10, x13 -; CHECK-GI-NEXT: ldp x13, x15, [sp, #16] +; CHECK-GI-NEXT: ldp x15, x13, [sp, #16] ; CHECK-GI-NEXT: mov x2, x9 ; CHECK-GI-NEXT: umulh x11, x0, x6 ; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: mul x15, x4, x15 +; CHECK-GI-NEXT: mul x13, x4, x13 ; CHECK-GI-NEXT: add x3, x10, x14 -; CHECK-GI-NEXT: umulh x16, x4, x13 +; CHECK-GI-NEXT: umulh x16, x4, x15 ; CHECK-GI-NEXT: add x1, x12, x11 -; CHECK-GI-NEXT: madd x15, x5, x13, x15 -; CHECK-GI-NEXT: mul x4, x4, x13 -; CHECK-GI-NEXT: add x5, x15, x16 +; CHECK-GI-NEXT: madd x13, x5, x15, x13 +; CHECK-GI-NEXT: mul x4, x4, x15 +; CHECK-GI-NEXT: add x5, x13, x16 ; CHECK-GI-NEXT: ret entry: %s = mul <3 x i128> %d, %e @@ -638,14 +638,14 @@ define <4 x i128> @v4i128(<4 x i128> %d, <4 x i128> %e) { ; CHECK-GI-NEXT: umulh x17, x4, x15 ; CHECK-GI-NEXT: add x3, x13, x14 ; CHECK-GI-NEXT: madd x15, x5, x15, x16 -; CHECK-GI-NEXT: ldp x16, x18, [sp, #48] +; CHECK-GI-NEXT: ldp x18, x16, [sp, #48] ; CHECK-GI-NEXT: mov x4, x10 -; CHECK-GI-NEXT: mul x18, x6, x18 -; CHECK-GI-NEXT: umulh x0, x6, x16 +; CHECK-GI-NEXT: mul x16, x6, x16 +; CHECK-GI-NEXT: umulh x0, x6, x18 ; CHECK-GI-NEXT: add x5, x15, x17 -; CHECK-GI-NEXT: madd x18, x7, x16, x18 -; CHECK-GI-NEXT: mul x6, x6, x16 -; CHECK-GI-NEXT: add x7, x18, x0 +; CHECK-GI-NEXT: madd x16, x7, x18, x16 +; CHECK-GI-NEXT: mul x6, x6, x18 +; CHECK-GI-NEXT: add x7, x16, x0 ; CHECK-GI-NEXT: mov x0, x8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/nzcv-save.ll b/llvm/test/CodeGen/AArch64/nzcv-save.ll index c40e529ccab1b..cc666dd8d34e6 100644 --- a/llvm/test/CodeGen/AArch64/nzcv-save.ll +++ b/llvm/test/CodeGen/AArch64/nzcv-save.ll @@ -6,19 +6,19 @@ define void @f(ptr nocapture %a, ptr nocapture %b, ptr nocapture %cc, ptr nocapture %dd) nounwind uwtable noinline ssp { ; CHECK-LABEL: f: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x8, x10, [x2] -; CHECK-NEXT: ldp x9, x11, [x3] +; CHECK-NEXT: ldp x8, x11, [x3] +; CHECK-NEXT: ldp x9, x10, [x2] ; CHECK-NEXT: ldp x13, x12, [x2, #16] -; CHECK-NEXT: adds x8, x8, x9 -; CHECK-NEXT: ldp x14, x9, [x3, #16] +; CHECK-NEXT: adds x8, x9, x8 +; CHECK-NEXT: ldp x9, x14, [x3, #16] ; CHECK-NEXT: adcs x10, x10, x11 ; CHECK-NEXT: stp x8, x10, [x0] -; CHECK-NEXT: adcs x11, x13, x14 -; CHECK-NEXT: adc x13, x12, x9 +; CHECK-NEXT: adcs x9, x13, x9 +; CHECK-NEXT: adc x11, x12, x14 ; CHECK-NEXT: orr x12, x12, #0x100 -; CHECK-NEXT: adc x9, x12, x9 -; CHECK-NEXT: stp x11, x13, [x0, #16] -; CHECK-NEXT: stp x11, x9, [x1, #16] +; CHECK-NEXT: stp x9, x11, [x0, #16] +; CHECK-NEXT: adc x11, x12, x14 +; CHECK-NEXT: stp x9, x11, [x1, #16] ; CHECK-NEXT: stp x8, x10, [x1] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll index 5f6b60a767f9d..4fe303b9bbf46 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll @@ -26,18 +26,18 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] ; NONEON-NOSVE-NEXT: fcvtzs w8, s0, #3 -; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s0, [sp, #16] ; NONEON-NOSVE-NEXT: fcvtzs w9, s1, #3 -; NONEON-NOSVE-NEXT: fcvtzs w10, s2, #3 -; NONEON-NOSVE-NEXT: fcvtzs w11, s0, #3 -; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] -; NONEON-NOSVE-NEXT: ldp s0, s3, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w10, s0, #3 +; NONEON-NOSVE-NEXT: fcvtzs w11, s2, #3 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s3, [sp] ; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #56] -; NONEON-NOSVE-NEXT: fcvtzs w12, s1, #3 -; NONEON-NOSVE-NEXT: fcvtzs w8, s2, #3 +; NONEON-NOSVE-NEXT: fcvtzs w12, s0, #3 +; NONEON-NOSVE-NEXT: fcvtzs w8, s1, #3 ; NONEON-NOSVE-NEXT: stp w11, w10, [sp, #48] ; NONEON-NOSVE-NEXT: fcvtzs w9, s3, #3 -; NONEON-NOSVE-NEXT: fcvtzs w10, s0, #3 +; NONEON-NOSVE-NEXT: fcvtzs w10, s2, #3 ; NONEON-NOSVE-NEXT: stp w8, w12, [sp, #40] ; NONEON-NOSVE-NEXT: stp w10, w9, [sp, #32] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index ad5f91a5f39a4..ec0693a541e44 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -194,12 +194,12 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: mov z0.h, p1/m, z1.h ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret @@ -429,12 +429,12 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p1/m, z1.s ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret @@ -553,12 +553,12 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: mov z0.d, p1/m, z1.d ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index 97f2e7a1e66cb..0c97eedd4362d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -973,11 +973,11 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] ; NONEON-NOSVE-NEXT: ldpsw x13, x12, [sp, #48] ; NONEON-NOSVE-NEXT: smull x11, w11, w12 -; NONEON-NOSVE-NEXT: ldpsw x12, x14, [sp, #56] +; NONEON-NOSVE-NEXT: ldpsw x14, x12, [sp, #56] ; NONEON-NOSVE-NEXT: smull x10, w10, w13 ; NONEON-NOSVE-NEXT: lsr x11, x11, #32 -; NONEON-NOSVE-NEXT: smull x9, w9, w14 -; NONEON-NOSVE-NEXT: smull x8, w8, w12 +; NONEON-NOSVE-NEXT: smull x9, w9, w12 +; NONEON-NOSVE-NEXT: smull x8, w8, w14 ; NONEON-NOSVE-NEXT: lsr x10, x10, #32 ; NONEON-NOSVE-NEXT: lsr x9, x9, #32 ; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #72] @@ -1038,12 +1038,12 @@ define void @smulh_v8i32(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] ; NONEON-NOSVE-NEXT: ldpsw x17, x16, [sp, #112] ; NONEON-NOSVE-NEXT: smull x15, w15, w16 -; NONEON-NOSVE-NEXT: ldpsw x16, x18, [sp, #120] +; NONEON-NOSVE-NEXT: ldpsw x18, x16, [sp, #120] ; NONEON-NOSVE-NEXT: smull x14, w14, w17 ; NONEON-NOSVE-NEXT: ldpsw x17, x1, [sp, #80] -; NONEON-NOSVE-NEXT: smull x13, w13, w18 +; NONEON-NOSVE-NEXT: smull x13, w13, w16 ; NONEON-NOSVE-NEXT: lsr x15, x15, #32 -; NONEON-NOSVE-NEXT: smull x12, w12, w16 +; NONEON-NOSVE-NEXT: smull x12, w12, w18 ; NONEON-NOSVE-NEXT: lsr x14, x14, #32 ; NONEON-NOSVE-NEXT: ldpsw x16, x18, [sp, #88] ; NONEON-NOSVE-NEXT: smull x11, w11, w1 @@ -2172,11 +2172,11 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] ; NONEON-NOSVE-NEXT: ldp w13, w12, [sp, #48] ; NONEON-NOSVE-NEXT: umull x11, w11, w12 -; NONEON-NOSVE-NEXT: ldp w12, w14, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #56] ; NONEON-NOSVE-NEXT: umull x10, w10, w13 ; NONEON-NOSVE-NEXT: lsr x11, x11, #32 -; NONEON-NOSVE-NEXT: umull x9, w9, w14 -; NONEON-NOSVE-NEXT: umull x8, w8, w12 +; NONEON-NOSVE-NEXT: umull x9, w9, w12 +; NONEON-NOSVE-NEXT: umull x8, w8, w14 ; NONEON-NOSVE-NEXT: lsr x10, x10, #32 ; NONEON-NOSVE-NEXT: lsr x9, x9, #32 ; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #72] @@ -2237,12 +2237,12 @@ define void @umulh_v8i32(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] ; NONEON-NOSVE-NEXT: ldp w17, w16, [sp, #112] ; NONEON-NOSVE-NEXT: umull x15, w15, w16 -; NONEON-NOSVE-NEXT: ldp w16, w18, [sp, #120] +; NONEON-NOSVE-NEXT: ldp w18, w16, [sp, #120] ; NONEON-NOSVE-NEXT: umull x14, w14, w17 ; NONEON-NOSVE-NEXT: ldp w17, w1, [sp, #80] -; NONEON-NOSVE-NEXT: umull x13, w13, w18 +; NONEON-NOSVE-NEXT: umull x13, w13, w16 ; NONEON-NOSVE-NEXT: lsr x15, x15, #32 -; NONEON-NOSVE-NEXT: umull x12, w12, w16 +; NONEON-NOSVE-NEXT: umull x12, w12, w18 ; NONEON-NOSVE-NEXT: lsr x14, x14, #32 ; NONEON-NOSVE-NEXT: ldp w16, w18, [sp, #88] ; NONEON-NOSVE-NEXT: umull x11, w11, w1 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 41eb731fd66df..39701131d7db6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -288,12 +288,12 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b -; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b +; CHECK-NEXT: mov z0.b, p1/m, z1.b ; CHECK-NEXT: sel z1.b, p0, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret @@ -692,12 +692,12 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { define void @select_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: mov z0.h, p1/m, z1.h ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret @@ -906,12 +906,12 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) { define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p1/m, z1.s ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret @@ -930,16 +930,16 @@ define void @select_v8i32(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: cmp w10, w9 ; NONEON-NOSVE-NEXT: csel w9, w10, w9, eq ; NONEON-NOSVE-NEXT: cmp w13, w12 -; NONEON-NOSVE-NEXT: ldp w15, w16, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w10, w16, [sp, #48] ; NONEON-NOSVE-NEXT: csel w12, w13, w12, eq ; NONEON-NOSVE-NEXT: cmp w14, w11 -; NONEON-NOSVE-NEXT: ldp w10, w13, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #32] ; NONEON-NOSVE-NEXT: csel w11, w14, w11, eq ; NONEON-NOSVE-NEXT: ldp w17, w14, [sp, #56] ; NONEON-NOSVE-NEXT: ldp w18, w1, [sp, #40] -; NONEON-NOSVE-NEXT: cmp w10, w15 +; NONEON-NOSVE-NEXT: cmp w15, w10 ; NONEON-NOSVE-NEXT: stp w12, w11, [sp, #72] -; NONEON-NOSVE-NEXT: csel w10, w10, w15, eq +; NONEON-NOSVE-NEXT: csel w10, w15, w10, eq ; NONEON-NOSVE-NEXT: cmp w13, w16 ; NONEON-NOSVE-NEXT: ldr w15, [sp] ; NONEON-NOSVE-NEXT: csel w13, w13, w16, eq @@ -1039,12 +1039,12 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) { define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: mov z0.d, p1/m, z1.d ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret @@ -1057,13 +1057,13 @@ define void @select_v4i64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 ; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] ; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] -; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #24] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #24] ; NONEON-NOSVE-NEXT: ldr x13, [sp, #40] -; NONEON-NOSVE-NEXT: ldp x10, x12, [sp, #48] +; NONEON-NOSVE-NEXT: ldp x11, x12, [sp, #48] ; NONEON-NOSVE-NEXT: cmp x9, x8 ; NONEON-NOSVE-NEXT: csel x8, x9, x8, eq -; NONEON-NOSVE-NEXT: cmp x11, x10 -; NONEON-NOSVE-NEXT: csel x9, x11, x10, eq +; NONEON-NOSVE-NEXT: cmp x10, x11 +; NONEON-NOSVE-NEXT: csel x9, x10, x11, eq ; NONEON-NOSVE-NEXT: ldr x10, [sp, #16] ; NONEON-NOSVE-NEXT: ldr x11, [sp] ; NONEON-NOSVE-NEXT: cmp x13, x12 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index 3d9f407c3064c..e0e88c47fb55c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -151,20 +151,20 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: .cfi_offset b13, -48 ; CHECK-NEXT: .cfi_offset b14, -56 ; CHECK-NEXT: .cfi_offset b15, -64 -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: mov z5.h, z0.h[7] -; CHECK-NEXT: mov z7.h, z0.h[6] -; CHECK-NEXT: mov z17.h, z0.h[5] -; CHECK-NEXT: mov z4.h, z3.h[7] -; CHECK-NEXT: mov z6.h, z3.h[6] -; CHECK-NEXT: mov z16.h, z3.h[5] -; CHECK-NEXT: mov z18.h, z3.h[4] -; CHECK-NEXT: mov z19.h, z0.h[4] -; CHECK-NEXT: mov z20.h, z2.h[7] -; CHECK-NEXT: mov z21.h, z1.h[7] -; CHECK-NEXT: mov z22.h, z2.h[6] -; CHECK-NEXT: mov z23.h, z1.h[6] +; CHECK-NEXT: ldp q1, q2, [x1] +; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: mov z4.h, z2.h[7] +; CHECK-NEXT: mov z6.h, z2.h[6] +; CHECK-NEXT: mov z16.h, z2.h[5] +; CHECK-NEXT: mov z5.h, z3.h[7] +; CHECK-NEXT: mov z7.h, z3.h[6] +; CHECK-NEXT: mov z17.h, z3.h[5] +; CHECK-NEXT: mov z18.h, z2.h[4] +; CHECK-NEXT: mov z19.h, z3.h[4] +; CHECK-NEXT: mov z20.h, z1.h[7] +; CHECK-NEXT: mov z21.h, z0.h[7] +; CHECK-NEXT: mov z22.h, z1.h[6] +; CHECK-NEXT: mov z23.h, z0.h[6] ; CHECK-NEXT: zip1 z24.h, z5.h, z4.h ; CHECK-NEXT: zip1 z25.h, z7.h, z6.h ; CHECK-NEXT: zip1 z16.h, z17.h, z16.h @@ -174,12 +174,12 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z18.h, z21.h, z20.h ; CHECK-NEXT: zip1 z21.s, z25.s, z24.s ; CHECK-NEXT: zip1 z22.h, z23.h, z22.h -; CHECK-NEXT: mov z23.h, z2.h[5] +; CHECK-NEXT: mov z23.h, z1.h[5] ; CHECK-NEXT: mov z20.h, z6.h[7] -; CHECK-NEXT: mov z24.h, z1.h[5] -; CHECK-NEXT: mov z25.h, z2.h[4] +; CHECK-NEXT: mov z24.h, z0.h[5] +; CHECK-NEXT: mov z25.h, z1.h[4] ; CHECK-NEXT: mov z19.h, z7.h[7] -; CHECK-NEXT: mov z26.h, z1.h[4] +; CHECK-NEXT: mov z26.h, z0.h[4] ; CHECK-NEXT: mov z27.h, z6.h[6] ; CHECK-NEXT: mov z28.h, z7.h[5] ; CHECK-NEXT: mov z29.h, z6.h[5] @@ -212,22 +212,22 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z19.s, z28.s, z27.s ; CHECK-NEXT: zip1 z18.s, z22.s, z18.s ; CHECK-NEXT: zip1 z20.s, z24.s, z23.s -; CHECK-NEXT: zip1 z0.h, z0.h, z3.h +; CHECK-NEXT: zip1 z2.h, z3.h, z2.h ; CHECK-NEXT: zip1 z3.s, z26.s, z25.s ; CHECK-NEXT: zip1 z22.s, z30.s, z29.s ; CHECK-NEXT: zip1 z6.h, z6.h, z7.h ; CHECK-NEXT: zip1 z7.d, z16.d, z21.d ; CHECK-NEXT: zip1 z16.d, z19.d, z17.d -; CHECK-NEXT: zip1 z1.h, z1.h, z2.h -; CHECK-NEXT: zip1 z2.h, z4.h, z5.h +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: zip1 z1.h, z4.h, z5.h ; CHECK-NEXT: zip1 z4.d, z20.d, z18.d ; CHECK-NEXT: zip1 z3.d, z22.d, z3.d -; CHECK-NEXT: add z0.h, z0.h, z6.h +; CHECK-NEXT: add z2.h, z2.h, z6.h ; CHECK-NEXT: add z5.h, z7.h, z16.h -; CHECK-NEXT: add z1.h, z1.h, z2.h -; CHECK-NEXT: add z2.h, z4.h, z3.h -; CHECK-NEXT: stp q0, q5, [x0, #32] -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: add z1.h, z4.h, z3.h +; CHECK-NEXT: stp q2, q5, [x0, #32] +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret ; @@ -659,10 +659,10 @@ define void @zip1_v8i32_undef(ptr %a) { define void @trn_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: trn_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: trn1 z4.b, z0.b, z1.b -; CHECK-NEXT: trn2 z0.b, z0.b, z1.b +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: trn1 z4.b, z1.b, z0.b +; CHECK-NEXT: trn2 z0.b, z1.b, z0.b ; CHECK-NEXT: trn1 z1.b, z2.b, z3.b ; CHECK-NEXT: trn2 z2.b, z2.b, z3.b ; CHECK-NEXT: add z0.b, z4.b, z0.b @@ -862,10 +862,10 @@ define void @trn_v8i16(ptr %a, ptr %b) { define void @trn_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: trn_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: trn1 z4.h, z0.h, z1.h -; CHECK-NEXT: trn2 z0.h, z0.h, z1.h +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: trn1 z4.h, z1.h, z0.h +; CHECK-NEXT: trn2 z0.h, z1.h, z0.h ; CHECK-NEXT: trn1 z1.h, z2.h, z3.h ; CHECK-NEXT: trn2 z2.h, z2.h, z3.h ; CHECK-NEXT: add z0.h, z4.h, z0.h @@ -961,10 +961,10 @@ define void @trn_v16i16(ptr %a, ptr %b) { define void @trn_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: trn_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: zip1 z4.s, z0.s, z1.s -; CHECK-NEXT: trn2 z0.s, z0.s, z1.s +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: zip1 z4.s, z1.s, z0.s +; CHECK-NEXT: trn2 z0.s, z1.s, z0.s ; CHECK-NEXT: trn1 z1.s, z2.s, z3.s ; CHECK-NEXT: trn2 z2.s, z2.s, z3.s ; CHECK-NEXT: add z0.s, z4.s, z0.s @@ -1006,11 +1006,11 @@ define void @trn_v8i32(ptr %a, ptr %b) { define void @trn_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: trn_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: zip1 z4.d, z0.d, z1.d -; CHECK-NEXT: trn2 z0.d, z0.d, z1.d +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: zip1 z4.d, z1.d, z0.d +; CHECK-NEXT: trn2 z0.d, z1.d, z0.d ; CHECK-NEXT: zip1 z1.d, z2.d, z3.d ; CHECK-NEXT: trn2 z2.d, z2.d, z3.d ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z4.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index e07036f2a1acf..90466e3cebd5e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -125,14 +125,14 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q2, q3, [x1, #32] ; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: fcmne p1.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: ldp q1, q6, [x1] +; CHECK-NEXT: ldp q6, q1, [x1] ; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 ; CHECK-NEXT: fcmne p2.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: fcmne p5.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmne p4.s, p0/z, z5.s, #0.0 ; CHECK-NEXT: fcmne p7.s, p0/z, z4.s, #0.0 -; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: fcmne p6.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z6.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff @@ -334,14 +334,14 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q2, q3, [x1, #32] ; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: fcmne p1.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: ldp q1, q6, [x1] +; CHECK-NEXT: ldp q6, q1, [x1] ; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 ; CHECK-NEXT: fcmne p2.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: fcmne p5.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmne p4.s, p0/z, z5.s, #0.0 ; CHECK-NEXT: fcmne p7.s, p0/z, z4.s, #0.0 -; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: fcmne p6.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z6.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 46a2459485987..75c5bee2ae0ab 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2913,28 +2913,28 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-NEXT: LBB25_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr q4, [x0] -; CHECK-NEXT: ldp q17, q7, [x1, #32] +; CHECK-NEXT: ldp q17, q16, [x1, #32] ; CHECK-NEXT: ldr q18, [x1] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: tbl.16b v5, { v4 }, v3 ; CHECK-NEXT: tbl.16b v6, { v4 }, v0 -; CHECK-NEXT: tbl.16b v16, { v4 }, v1 +; CHECK-NEXT: tbl.16b v7, { v4 }, v1 ; CHECK-NEXT: tbl.16b v4, { v4 }, v2 ; CHECK-NEXT: ldr q21, [x8, #16]! ; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: umull2.2d v19, v5, v7 +; CHECK-NEXT: umull2.2d v19, v5, v16 ; CHECK-NEXT: umull2.2d v20, v6, v17 -; CHECK-NEXT: umull2.2d v22, v16, v18 -; CHECK-NEXT: umull.2d v5, v5, v7 -; CHECK-NEXT: umull2.2d v7, v4, v21 +; CHECK-NEXT: umull2.2d v22, v7, v18 +; CHECK-NEXT: umull.2d v5, v5, v16 +; CHECK-NEXT: umull2.2d v16, v4, v21 ; CHECK-NEXT: umull.2d v4, v4, v21 -; CHECK-NEXT: umull.2d v16, v16, v18 +; CHECK-NEXT: umull.2d v7, v7, v18 ; CHECK-NEXT: umull.2d v6, v6, v17 ; CHECK-NEXT: str q20, [x0, #80] ; CHECK-NEXT: stp q22, q4, [x0, #16] ; CHECK-NEXT: stp q5, q19, [x0, #96] -; CHECK-NEXT: str q7, [x0, #48] -; CHECK-NEXT: str q16, [x0] +; CHECK-NEXT: str q16, [x0, #48] +; CHECK-NEXT: str q7, [x0] ; CHECK-NEXT: str q6, [x0, #64]! ; CHECK-NEXT: b.ne LBB25_1 ; CHECK-NEXT: ; %bb.2: ; %exit diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll index 27b93872b9f1d..b67080bd4798d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -40,17 +40,17 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4 -; GFX9-NEXT: global_load_ushort v8, v[2:3], off -; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v7, v[2:3], off +; GFX9-NEXT: global_load_ushort v8, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:4 ; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:2 ; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_pk_add_u16 v2, v7, v9 +; GFX9-NEXT: v_pk_add_u16 v2, v9, v8 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshl_or_b32 v0, v10, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -206,10 +206,10 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4 -; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8 -; GFX9-NEXT: global_load_ushort v9, v[2:3], off -; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4 -; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8 +; GFX9-NEXT: global_load_ushort v8, v[2:3], off +; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:8 +; GFX9-NEXT: global_load_ushort v11, v[0:1], off offset:8 ; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:2 ; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:6 ; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2 @@ -218,12 +218,12 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v9 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_pk_add_u16 v6, v8, v11 +; GFX9-NEXT: v_pk_add_u16 v6, v11, v10 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -421,11 +421,11 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4 ; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8 -; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:12 -; GFX9-NEXT: global_load_ushort v10, v[2:3], off -; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4 -; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:8 -; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:12 +; GFX9-NEXT: global_load_ushort v9, v[2:3], off +; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8 +; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:12 +; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:12 ; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:2 ; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:6 ; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:10 @@ -438,14 +438,14 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v9 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v10 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_pk_add_u16 v8, v9, v13 +; GFX9-NEXT: v_pk_add_u16 v8, v13, v12 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(4) @@ -720,8 +720,8 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16 ; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off -; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:20 -; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:20 +; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:20 +; GFX9-NEXT: global_load_ushort v17, v[0:1], off offset:20 ; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18 ; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18 ; GFX9-NEXT: s_waitcnt vmcnt(6) @@ -738,7 +738,7 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v15 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off -; GFX9-NEXT: v_pk_add_u16 v6, v16, v17 +; GFX9-NEXT: v_pk_add_u16 v6, v17, v16 ; GFX9-NEXT: v_pk_add_u16 v0, v7, v8 ; GFX9-NEXT: global_store_short v[4:5], v0, off offset:16 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:18 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 44abfd272be88..37db1c56022e8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -3121,8 +3121,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 @@ -3284,7 +3284,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3522,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 @@ -4334,8 +4333,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -4990,13 +4989,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 @@ -5016,13 +5016,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 @@ -5042,13 +5043,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 @@ -5068,13 +5070,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 @@ -5094,13 +5097,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 @@ -5119,13 +5123,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 @@ -5145,13 +5150,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 @@ -5171,13 +5177,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 @@ -5195,13 +5202,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 @@ -5221,13 +5229,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 @@ -5293,8 +5302,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5484,7 +5493,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -5493,7 +5502,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -5689,7 +5697,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 ; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 @@ -6106,13 +6113,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 @@ -6131,13 +6139,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 @@ -6156,13 +6165,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 @@ -6181,13 +6191,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 @@ -6206,13 +6217,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 @@ -6231,13 +6243,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 @@ -6256,13 +6269,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 @@ -12441,10 +12455,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 @@ -12464,33 +12478,35 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12511,10 +12527,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12525,69 +12541,69 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 @@ -12602,14 +12618,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12631,16 +12647,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12659,19 +12675,19 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12690,10 +12706,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12762,7 +12778,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12780,121 +12796,119 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 ; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v48 @@ -12904,11 +12918,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v62 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v38 @@ -12918,18 +12932,18 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v46 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v44 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 @@ -12940,24 +12954,24 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v63 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 @@ -12966,86 +12980,85 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v51 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 @@ -13058,7 +13071,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 @@ -13079,7 +13092,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13092,7 +13105,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13105,7 +13118,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v23 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13118,7 +13131,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13131,7 +13144,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13144,7 +13157,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13214,7 +13227,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_or_b32_e32 v31, v31, v43 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 @@ -13228,249 +13241,249 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v42, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -13479,16 +13492,18 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -13496,7 +13511,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v41, v1 @@ -13509,11 +13524,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -13531,7 +13546,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 @@ -13549,7 +13564,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v6, v56, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 @@ -13562,9 +13577,9 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v8, v44, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 @@ -13579,7 +13594,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 @@ -13587,7 +13602,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -13595,7 +13610,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13608,12 +13623,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -13626,7 +13641,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -13634,19 +13649,17 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload @@ -13656,7 +13669,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13665,7 +13678,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13673,7 +13686,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13682,7 +13695,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13690,7 +13703,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13699,7 +13712,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13707,7 +13720,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13724,7 +13737,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13751,7 +13764,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 @@ -13768,7 +13781,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 @@ -13785,7 +13798,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 @@ -13802,7 +13815,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 @@ -13819,7 +13832,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 @@ -13836,7 +13849,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 @@ -13928,7 +13941,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_or_b32_e32 v31, v43, v31 ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 @@ -14009,19 +14022,18 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v13 @@ -14053,59 +14065,61 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -14114,25 +14128,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -14140,25 +14154,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -14166,25 +14180,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -14192,10 +14206,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 @@ -14207,10 +14221,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -14221,22 +14235,22 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -14247,14 +14261,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 @@ -14266,34 +14280,33 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -14301,19 +14314,19 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload @@ -14324,23 +14337,23 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr38 @@ -14381,147 +14394,217 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -14531,16 +14614,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -14549,7 +14632,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload @@ -14558,253 +14641,190 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v31, 0x300 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v9, 3, v61 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 @@ -14813,31 +14833,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -14874,13 +14888,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_add_u16_e32 v8, 3, v62 ; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v59 +; VI-NEXT: v_add_u16_e32 v9, 3, v32 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v62 +; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 @@ -14888,27 +14902,28 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_add_u16_e32 v10, 3, v58 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v60 +; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v11, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: v_add_u16_e32 v11, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_add_u16_e32 v12, 3, v57 ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_add_u16_sdwa v12, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v46 +; VI-NEXT: v_add_u16_e32 v12, 3, v47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 @@ -14919,148 +14934,149 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v54 +; VI-NEXT: v_add_u16_e32 v14, 3, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v15, 3, v40 -; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v18, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v18, 0x300, v18 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v19, 0x300, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v20, 0x300, v20 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v21, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 @@ -15074,19 +15090,19 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v25, v26 ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 @@ -15100,20 +15116,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v27, v28 ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v28, v28, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -15126,7 +15142,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v30 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -15139,7 +15155,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v30, v30, v32 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v32, 3, v32 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -15148,7 +15164,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v33, 3, v33 -; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v33, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB14_4: ; %end @@ -15225,19 +15241,18 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 ; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v13 @@ -15249,93 +15264,95 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 @@ -15345,25 +15362,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -15372,25 +15389,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -15399,25 +15416,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -15426,10 +15443,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 @@ -15441,10 +15458,10 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -15456,22 +15473,22 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -15483,14 +15500,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 @@ -15502,34 +15519,33 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -15537,20 +15553,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload @@ -15561,23 +15577,23 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr38 @@ -15618,147 +15634,217 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -15768,16 +15854,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -15786,7 +15872,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload @@ -15795,258 +15881,196 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 @@ -16055,32 +16079,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -16117,41 +16135,41 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 @@ -16162,148 +16180,149 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 @@ -16317,19 +16336,19 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 @@ -16343,20 +16362,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -16369,7 +16388,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -16382,7 +16401,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -16391,7 +16410,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v32, v63, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX9-NEXT: .LBB14_4: ; %end @@ -19122,8 +19141,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_cbranch_vccnz .LBB15_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 @@ -19178,7 +19197,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 @@ -19300,12 +19319,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -20220,8 +20239,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -20273,7 +20292,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 @@ -20368,11 +20387,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -20495,11 +20514,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -21284,11 +21303,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -21500,11 +21519,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -23572,9 +23591,9 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -23607,7 +23626,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -23726,7 +23745,6 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23831,7 +23849,6 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 @@ -25530,6 +25547,8 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 @@ -25606,8 +25625,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 @@ -31871,9 +31888,9 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -31906,7 +31923,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -31979,7 +31996,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -35061,9 +35077,9 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -35113,9 +35129,8 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 @@ -35847,8 +35862,8 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -35882,7 +35897,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35933,7 +35948,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 @@ -37049,26 +37063,25 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 @@ -37081,102 +37094,107 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -37193,9 +37211,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -37225,83 +37241,71 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v16, v16, v53 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v50 ; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v19, v19, v48 -; SI-NEXT: v_or_b32_e32 v21, v21, v36 -; SI-NEXT: v_or_b32_e32 v22, v22, v34 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v49 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v39 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v52 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v62 ; SI-NEXT: v_or_b32_e32 v2, v2, v61 @@ -37314,13 +37318,15 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v9, v46 ; SI-NEXT: v_or_b32_e32 v10, v10, v45 ; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v42 -; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v42 ; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v54 -; SI-NEXT: v_or_b32_e32 v20, v20, v38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v15, v15, v55 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -37334,63 +37340,79 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v51 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v31, v31, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: .LBB26_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -37407,69 +37429,68 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v16, v53, v16 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_or_b32_e32 v21, v36, v21 -; SI-NEXT: v_or_b32_e32 v22, v34, v22 -; SI-NEXT: v_or_b32_e32 v23, v32, v23 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -37486,8 +37507,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_or_b32_e32 v1, v62, v1 ; SI-NEXT: v_or_b32_e32 v2, v61, v2 @@ -37500,28 +37520,25 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v46, v9 ; SI-NEXT: v_or_b32_e32 v10, v45, v10 ; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 ; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v18, v49, v18 -; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v19, v39, v19 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 @@ -37531,34 +37548,37 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v34, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_or_b32_e32 v31, v38, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 ; SI-NEXT: .LBB26_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -37578,7 +37598,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v32i32: @@ -40846,8 +40866,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 @@ -41009,7 +41029,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -41247,7 +41267,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 @@ -42059,8 +42078,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -42715,13 +42734,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 @@ -42741,13 +42761,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 @@ -42767,13 +42788,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 @@ -42793,13 +42815,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 @@ -42819,13 +42842,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 @@ -42844,13 +42868,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 @@ -42870,13 +42895,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 @@ -42896,13 +42922,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 @@ -42920,13 +42947,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 @@ -42946,13 +42974,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 @@ -43018,8 +43047,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -43209,7 +43238,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -43218,7 +43247,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -43414,7 +43442,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 ; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -43831,13 +43858,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 @@ -43856,13 +43884,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 @@ -43881,13 +43910,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 @@ -43906,13 +43936,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 @@ -43931,13 +43962,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 @@ -43956,13 +43988,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 @@ -43981,13 +44014,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 @@ -51153,10 +51187,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 @@ -51176,33 +51210,35 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -51223,10 +51259,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -51237,69 +51273,69 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 @@ -51314,14 +51350,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -51343,16 +51379,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -51371,19 +51407,19 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -51402,10 +51438,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -51474,7 +51510,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -51492,121 +51528,119 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 ; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v48 @@ -51616,11 +51650,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v62 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v38 @@ -51630,18 +51664,18 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v46 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v44 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 @@ -51652,24 +51686,24 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v63 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 @@ -51678,86 +51712,85 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v51 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 @@ -51770,7 +51803,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 @@ -51791,7 +51824,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51804,7 +51837,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51817,7 +51850,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v23 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51830,7 +51863,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51843,7 +51876,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51856,7 +51889,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51926,7 +51959,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_or_b32_e32 v31, v31, v43 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 @@ -51940,249 +51973,249 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: .LBB38_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v42, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -52191,16 +52224,18 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -52208,7 +52243,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v41, v1 @@ -52221,11 +52256,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -52243,7 +52278,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 @@ -52261,7 +52296,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v6, v56, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 @@ -52274,9 +52309,9 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v8, v44, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 @@ -52291,7 +52326,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 @@ -52299,7 +52334,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -52307,7 +52342,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -52320,12 +52355,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -52338,7 +52373,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -52346,19 +52381,17 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload @@ -52368,7 +52401,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -52377,7 +52410,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -52385,7 +52418,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -52394,7 +52427,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -52402,7 +52435,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -52411,7 +52444,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -52419,7 +52452,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -52436,7 +52469,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -52463,7 +52496,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 @@ -52480,7 +52513,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 @@ -52497,7 +52530,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 @@ -52514,7 +52547,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 @@ -52531,7 +52564,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 @@ -52548,7 +52581,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 @@ -52640,7 +52673,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_or_b32_e32 v31, v43, v31 ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 @@ -52721,19 +52754,18 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v13 @@ -52765,59 +52797,61 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -52826,25 +52860,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -52852,25 +52886,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -52878,25 +52912,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -52904,10 +52938,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 @@ -52919,10 +52953,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -52933,22 +52967,22 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -52959,14 +52993,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 @@ -52978,34 +53012,33 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -53013,19 +53046,19 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload @@ -53036,23 +53069,23 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr38 @@ -53093,147 +53126,217 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -53243,16 +53346,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -53261,7 +53364,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload @@ -53270,253 +53373,190 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: .LBB38_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB38_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v31, 0x300 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v9, 3, v61 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 @@ -53525,31 +53565,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -53586,13 +53620,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_add_u16_e32 v8, 3, v62 ; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v59 +; VI-NEXT: v_add_u16_e32 v9, 3, v32 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v62 +; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 @@ -53600,27 +53634,28 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_add_u16_e32 v10, 3, v58 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v60 +; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v11, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: v_add_u16_e32 v11, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_add_u16_e32 v12, 3, v57 ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_add_u16_sdwa v12, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v46 +; VI-NEXT: v_add_u16_e32 v12, 3, v47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 @@ -53631,148 +53666,149 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v54 +; VI-NEXT: v_add_u16_e32 v14, 3, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v15, 3, v40 -; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v18, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v18, 0x300, v18 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v19, 0x300, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v20, 0x300, v20 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v21, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 @@ -53786,19 +53822,19 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v25, v26 ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 @@ -53812,20 +53848,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v27, v28 ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v28, v28, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -53838,7 +53874,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v30 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -53851,7 +53887,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v30, v30, v32 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v32, 3, v32 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -53860,7 +53896,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v33, 3, v33 -; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v33, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB38_4: ; %end @@ -53937,19 +53973,18 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 ; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v13 @@ -53961,93 +53996,95 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 @@ -54057,25 +54094,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -54084,25 +54121,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -54111,25 +54148,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -54138,10 +54175,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 @@ -54153,10 +54190,10 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -54168,22 +54205,22 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -54195,14 +54232,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 @@ -54214,34 +54251,33 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -54249,20 +54285,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload @@ -54273,23 +54309,23 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr38 @@ -54330,147 +54366,217 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -54480,16 +54586,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -54498,7 +54604,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload @@ -54507,258 +54613,196 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: .LBB38_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB38_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 @@ -54767,32 +54811,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -54829,41 +54867,41 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 @@ -54874,148 +54912,149 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 @@ -55029,19 +55068,19 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 @@ -55055,20 +55094,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -55081,7 +55120,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -55094,7 +55133,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -55103,7 +55142,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v32, v63, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX9-NEXT: .LBB38_4: ; %end @@ -57834,8 +57873,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_cbranch_vccnz .LBB39_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 @@ -57890,7 +57929,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 @@ -58012,12 +58051,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -58932,8 +58971,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_cbranch_vccnz .LBB39_5 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -58985,7 +59024,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 @@ -59080,11 +59119,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -59207,11 +59246,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -59996,11 +60035,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -60212,11 +60251,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -62284,9 +62323,9 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -62319,7 +62358,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -62438,7 +62477,6 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -62543,7 +62581,6 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v31, 1.0, v63 @@ -64288,6 +64325,8 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 @@ -64364,8 +64403,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 @@ -70629,9 +70666,9 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -70664,7 +70701,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -70737,7 +70774,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -73792,9 +73828,9 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -73844,9 +73880,8 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 @@ -74578,8 +74613,8 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -74613,7 +74648,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -74664,7 +74699,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 @@ -75760,26 +75794,25 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 @@ -75792,102 +75825,107 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -75904,9 +75942,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -75936,83 +75972,71 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v16, v16, v53 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v50 ; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v19, v19, v48 -; SI-NEXT: v_or_b32_e32 v21, v21, v36 -; SI-NEXT: v_or_b32_e32 v22, v22, v34 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v49 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v39 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v52 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v62 ; SI-NEXT: v_or_b32_e32 v2, v2, v61 @@ -76025,13 +76049,15 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v9, v46 ; SI-NEXT: v_or_b32_e32 v10, v10, v45 ; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v42 -; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v42 ; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v54 -; SI-NEXT: v_or_b32_e32 v20, v20, v38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v15, v15, v55 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -76045,63 +76071,79 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v51 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v31, v31, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -76118,69 +76160,68 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v16, v53, v16 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_or_b32_e32 v21, v36, v21 -; SI-NEXT: v_or_b32_e32 v22, v34, v22 -; SI-NEXT: v_or_b32_e32 v23, v32, v23 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -76197,8 +76238,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_or_b32_e32 v1, v62, v1 ; SI-NEXT: v_or_b32_e32 v2, v61, v2 @@ -76211,28 +76251,25 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v46, v9 ; SI-NEXT: v_or_b32_e32 v10, v45, v10 ; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 ; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v18, v49, v18 -; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v19, v39, v19 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 @@ -76242,34 +76279,37 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v34, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_or_b32_e32 v31, v38, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -76289,7 +76329,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v32f32: @@ -78543,8 +78583,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 @@ -78706,7 +78746,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -79755,8 +79795,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -80411,13 +80451,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 @@ -80437,13 +80478,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 @@ -80463,13 +80505,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 @@ -80489,13 +80532,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 @@ -80515,13 +80559,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 @@ -80540,13 +80585,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 @@ -80566,13 +80612,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 @@ -80592,13 +80639,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 @@ -80616,13 +80664,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 @@ -80642,13 +80691,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 @@ -80714,8 +80764,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -80905,7 +80955,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -80914,7 +80964,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -81526,13 +81575,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 @@ -81551,13 +81601,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 @@ -81576,13 +81627,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 @@ -81601,13 +81653,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 @@ -81626,13 +81679,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 @@ -81651,13 +81705,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 @@ -81676,13 +81731,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 @@ -87884,10 +87940,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 @@ -87907,33 +87963,35 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -87954,10 +88012,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -87968,69 +88026,69 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 @@ -88045,14 +88103,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -88074,16 +88132,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -88102,19 +88160,19 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -88133,10 +88191,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -88205,7 +88263,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -88223,121 +88281,119 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 ; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v48 @@ -88347,11 +88403,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v62 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v38 @@ -88361,18 +88417,18 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v46 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v44 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 @@ -88383,24 +88439,24 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v63 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 @@ -88409,86 +88465,85 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v51 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 @@ -88501,7 +88556,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 @@ -88522,7 +88577,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -88535,7 +88590,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -88548,7 +88603,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v23 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -88561,7 +88616,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -88574,7 +88629,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -88587,7 +88642,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -88657,7 +88712,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_or_b32_e32 v31, v31, v43 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 @@ -88671,249 +88726,249 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: .LBB58_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v42, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -88922,16 +88977,18 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -88939,7 +88996,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v41, v1 @@ -88952,11 +89009,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -88974,7 +89031,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 @@ -88992,7 +89049,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v6, v56, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 @@ -89005,9 +89062,9 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v8, v44, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 @@ -89022,7 +89079,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 @@ -89030,7 +89087,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -89038,7 +89095,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89051,12 +89108,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -89069,7 +89126,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -89077,19 +89134,17 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload @@ -89099,7 +89154,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89108,7 +89163,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -89116,7 +89171,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89125,7 +89180,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -89133,7 +89188,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89142,7 +89197,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -89150,7 +89205,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89167,7 +89222,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89194,7 +89249,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 @@ -89211,7 +89266,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 @@ -89228,7 +89283,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 @@ -89245,7 +89300,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 @@ -89262,7 +89317,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 @@ -89279,7 +89334,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 @@ -89371,7 +89426,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_or_b32_e32 v31, v43, v31 ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 @@ -89452,19 +89507,18 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v13 @@ -89496,59 +89550,61 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -89557,25 +89613,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -89583,25 +89639,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -89609,25 +89665,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -89635,10 +89691,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 @@ -89650,10 +89706,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -89664,22 +89720,22 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -89690,14 +89746,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 @@ -89709,34 +89765,33 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -89744,19 +89799,19 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload @@ -89767,23 +89822,23 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr38 @@ -89824,147 +89879,217 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -89974,16 +90099,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -89992,7 +90117,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload @@ -90001,253 +90126,190 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: .LBB58_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB58_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v31, 0x300 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v9, 3, v61 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 @@ -90256,31 +90318,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -90317,13 +90373,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_add_u16_e32 v8, 3, v62 ; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v59 +; VI-NEXT: v_add_u16_e32 v9, 3, v32 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v62 +; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 @@ -90331,27 +90387,28 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_add_u16_e32 v10, 3, v58 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v60 +; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v11, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: v_add_u16_e32 v11, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_add_u16_e32 v12, 3, v57 ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_add_u16_sdwa v12, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v46 +; VI-NEXT: v_add_u16_e32 v12, 3, v47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 @@ -90362,148 +90419,149 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v54 +; VI-NEXT: v_add_u16_e32 v14, 3, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v15, 3, v40 -; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v18, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v18, 0x300, v18 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v19, 0x300, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v20, 0x300, v20 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v21, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 @@ -90517,19 +90575,19 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v25, v26 ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 @@ -90543,20 +90601,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v27, v28 ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v28, v28, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -90569,7 +90627,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v30 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -90582,7 +90640,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v30, v30, v32 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v32, 3, v32 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -90591,7 +90649,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v33, 3, v33 -; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v33, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB58_4: ; %end @@ -90668,19 +90726,18 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 ; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v13 @@ -90692,93 +90749,95 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 @@ -90788,25 +90847,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -90815,25 +90874,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -90842,25 +90901,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -90869,10 +90928,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 @@ -90884,10 +90943,10 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -90899,22 +90958,22 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -90926,14 +90985,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 @@ -90945,34 +91004,33 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -90980,20 +91038,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload @@ -91004,23 +91062,23 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr38 @@ -91061,147 +91119,217 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -91211,16 +91339,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -91229,7 +91357,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload @@ -91238,258 +91366,196 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: .LBB58_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB58_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 @@ -91498,32 +91564,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -91560,41 +91620,41 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 @@ -91605,148 +91665,149 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 @@ -91760,19 +91821,19 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 @@ -91786,20 +91847,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -91812,7 +91873,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -91825,7 +91886,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -91834,7 +91895,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v32, v63, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX9-NEXT: .LBB58_4: ; %end @@ -94565,8 +94626,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_cbranch_vccnz .LBB59_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 @@ -94621,7 +94682,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 @@ -94743,12 +94804,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -95663,8 +95724,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_cbranch_vccnz .LBB59_5 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -95716,7 +95777,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 @@ -95811,11 +95872,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -95938,11 +95999,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -96727,11 +96788,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -96943,11 +97004,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -99015,9 +99076,9 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -99050,7 +99111,7 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -99169,7 +99230,6 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -100960,6 +101020,8 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 @@ -101036,8 +101098,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 @@ -107301,9 +107361,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -107336,7 +107396,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -107409,7 +107469,6 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -110506,9 +110565,9 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -110558,9 +110617,8 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 @@ -111292,8 +111350,8 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -111327,7 +111385,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -112507,26 +112565,25 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 @@ -112539,102 +112596,107 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -112651,11 +112713,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v23, v23, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -112685,81 +112743,71 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v16, v16, v53 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v50 ; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v19, v19, v48 -; SI-NEXT: v_or_b32_e32 v21, v21, v36 -; SI-NEXT: v_or_b32_e32 v22, v22, v34 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v49 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v39 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v52 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v62 ; SI-NEXT: v_or_b32_e32 v2, v2, v61 @@ -112772,13 +112820,15 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v9, v46 ; SI-NEXT: v_or_b32_e32 v10, v10, v45 ; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v42 -; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v42 ; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v54 -; SI-NEXT: v_or_b32_e32 v20, v20, v38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v15, v15, v55 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -112792,63 +112842,79 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v51 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v31, v31, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: .LBB70_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -112865,69 +112931,68 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v16, v53, v16 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_or_b32_e32 v21, v36, v21 -; SI-NEXT: v_or_b32_e32 v22, v34, v22 -; SI-NEXT: v_or_b32_e32 v23, v32, v23 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -112944,8 +113009,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_or_b32_e32 v1, v62, v1 ; SI-NEXT: v_or_b32_e32 v2, v61, v2 @@ -112958,28 +113022,25 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v46, v9 ; SI-NEXT: v_or_b32_e32 v10, v45, v10 ; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 ; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v18, v49, v18 -; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v19, v39, v19 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 @@ -112989,34 +113050,37 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v34, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_or_b32_e32 v31, v38, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 ; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -113036,7 +113100,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v16i64: @@ -114321,8 +114385,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 @@ -114484,7 +114548,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -115517,8 +115581,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -116158,13 +116222,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 @@ -116184,13 +116249,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 @@ -116210,13 +116276,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 @@ -116236,13 +116303,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 @@ -116262,13 +116330,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 @@ -116287,13 +116356,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 @@ -116313,13 +116383,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 @@ -116339,13 +116410,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 @@ -116390,13 +116462,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 @@ -116464,8 +116537,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; kill: killed $vgpr41 @@ -116659,14 +116732,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(32) +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -117268,13 +117340,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 @@ -117293,13 +117366,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 @@ -117318,13 +117392,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 @@ -117343,13 +117418,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 @@ -117368,13 +117444,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 @@ -117393,13 +117470,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 @@ -117418,13 +117496,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 @@ -122416,13 +122495,14 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 @@ -124558,10 +124638,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 @@ -124581,33 +124661,35 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v9 ; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 ; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -124628,10 +124710,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -124642,69 +124724,69 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 @@ -124719,14 +124801,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -124748,16 +124830,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -124776,19 +124858,19 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -124807,10 +124889,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -124879,7 +124961,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -124897,121 +124979,119 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 ; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_or_b32_e32 v3, v40, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v48 @@ -125021,11 +125101,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v62 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v38 @@ -125035,18 +125115,18 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v46 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v8, v8, v44 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 @@ -125057,24 +125137,24 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v63 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 @@ -125083,86 +125163,85 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v51 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v36 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 @@ -125175,7 +125254,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 @@ -125196,7 +125275,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125209,7 +125288,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125222,7 +125301,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v23 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125235,7 +125314,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125248,7 +125327,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125261,7 +125340,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125331,7 +125410,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_or_b32_e32 v31, v31, v43 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 @@ -125345,249 +125424,249 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: .LBB74_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v54, v9 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v42, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -125596,16 +125675,18 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -125613,7 +125694,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v41, v1 @@ -125626,11 +125707,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -125648,7 +125729,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 @@ -125666,7 +125747,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v6, v56, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 @@ -125679,9 +125760,9 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v8, v44, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 @@ -125696,7 +125777,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 @@ -125704,7 +125785,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -125712,7 +125793,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125725,12 +125806,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -125743,7 +125824,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -125751,19 +125832,17 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload @@ -125773,7 +125852,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125782,7 +125861,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -125790,7 +125869,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125799,7 +125878,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -125807,7 +125886,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125816,7 +125895,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -125824,7 +125903,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125841,7 +125920,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -125868,7 +125947,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 @@ -125885,7 +125964,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 @@ -125902,7 +125981,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 @@ -125919,7 +125998,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 @@ -125936,7 +126015,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 @@ -125953,7 +126032,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 @@ -126045,7 +126124,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_or_b32_e32 v31, v43, v31 ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 @@ -126126,19 +126205,18 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v13 @@ -126170,59 +126248,61 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -126231,25 +126311,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -126257,25 +126337,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -126283,25 +126363,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -126309,10 +126389,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 @@ -126324,10 +126404,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -126338,22 +126418,22 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -126364,14 +126444,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 @@ -126383,34 +126463,33 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -126418,19 +126497,19 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload @@ -126441,23 +126520,23 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr38 @@ -126498,147 +126577,217 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -126648,16 +126797,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -126666,7 +126815,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload @@ -126675,253 +126824,190 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: .LBB74_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB74_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v31, 0x300 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v9, 3, v61 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v1 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 @@ -126930,31 +127016,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -126991,13 +127071,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_add_u16_e32 v8, 3, v62 ; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v59 +; VI-NEXT: v_add_u16_e32 v9, 3, v32 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v62 +; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 @@ -127005,27 +127085,28 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_add_u16_e32 v10, 3, v58 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v60 +; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v11, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: v_add_u16_e32 v11, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_add_u16_e32 v12, 3, v57 ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_add_u16_sdwa v12, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v46 +; VI-NEXT: v_add_u16_e32 v12, 3, v47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 @@ -127036,148 +127117,149 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v54 +; VI-NEXT: v_add_u16_e32 v14, 3, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v15, 3, v40 -; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v18, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v18, 0x300, v18 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v19, 0x300, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v20, 0x300, v20 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v21, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 @@ -127191,19 +127273,19 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v25, v26 ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 @@ -127217,20 +127299,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v27, v28 ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v28, v28, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -127243,7 +127325,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v30 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -127256,7 +127338,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v30, v30, v32 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v32, 3, v32 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -127265,7 +127347,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v33, 3, v33 -; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v33, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB74_4: ; %end @@ -127342,19 +127424,18 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 ; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v13 @@ -127366,93 +127447,95 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 @@ -127462,25 +127545,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -127489,25 +127572,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -127516,25 +127599,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -127543,10 +127626,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 @@ -127558,10 +127641,10 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -127573,22 +127656,22 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -127600,14 +127683,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 @@ -127619,34 +127702,33 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -127654,20 +127736,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload @@ -127678,23 +127760,23 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr38 @@ -127735,147 +127817,217 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -127885,16 +128037,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -127903,7 +128055,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload @@ -127912,258 +128064,196 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: .LBB74_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB74_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 @@ -128172,32 +128262,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -128234,41 +128318,41 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 @@ -128279,148 +128363,149 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 @@ -128434,19 +128519,19 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 @@ -128460,20 +128545,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -128486,7 +128571,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -128499,7 +128584,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -128508,7 +128593,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v32, v63, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX9-NEXT: .LBB74_4: ; %end @@ -131239,8 +131324,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_cbranch_vccnz .LBB75_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 @@ -131295,7 +131380,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 @@ -131417,12 +131502,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -132337,8 +132422,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_cbranch_vccnz .LBB75_5 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -132390,7 +132475,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 @@ -132485,11 +132570,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -132612,11 +132697,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -133401,11 +133486,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -133617,11 +133702,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -135689,8 +135774,8 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 @@ -135788,14 +135873,13 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -137582,6 +137666,8 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 @@ -137658,8 +137744,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 @@ -143923,8 +144007,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 @@ -144022,7 +144106,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -144069,7 +144153,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 @@ -146995,9 +147078,9 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -147047,9 +147130,8 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 @@ -147781,8 +147863,8 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -147816,7 +147898,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -148869,26 +148951,25 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 @@ -148901,102 +148982,107 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -149013,11 +149099,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v23, v23, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -149047,81 +149129,71 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v35 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v16, v16, v53 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v50 ; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: v_or_b32_e32 v19, v19, v48 -; SI-NEXT: v_or_b32_e32 v21, v21, v36 -; SI-NEXT: v_or_b32_e32 v22, v22, v34 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v49 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v39 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v52 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v62 ; SI-NEXT: v_or_b32_e32 v2, v2, v61 @@ -149134,13 +149206,15 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v9, v46 ; SI-NEXT: v_or_b32_e32 v10, v10, v45 ; SI-NEXT: v_or_b32_e32 v11, v11, v44 -; SI-NEXT: v_or_b32_e32 v12, v12, v42 -; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v42 ; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v54 -; SI-NEXT: v_or_b32_e32 v20, v20, v38 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_or_b32_e32 v15, v15, v55 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -149154,63 +149228,79 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v17, v51 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; kill: killed $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_or_b32_e32 v31, v31, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: .LBB86_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -149227,69 +149317,68 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v16, v53, v16 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v19, v48, v19 -; SI-NEXT: v_or_b32_e32 v21, v36, v21 -; SI-NEXT: v_or_b32_e32 v22, v34, v22 -; SI-NEXT: v_or_b32_e32 v23, v32, v23 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -149306,8 +149395,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_or_b32_e32 v1, v62, v1 ; SI-NEXT: v_or_b32_e32 v2, v61, v2 @@ -149320,28 +149408,25 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v46, v9 ; SI-NEXT: v_or_b32_e32 v10, v45, v10 ; SI-NEXT: v_or_b32_e32 v11, v44, v11 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 ; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v54, v15 -; SI-NEXT: v_or_b32_e32 v18, v49, v18 -; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v19, v39, v19 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 @@ -149351,34 +149436,37 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v17, v51, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v34, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_or_b32_e32 v31, v38, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 ; SI-NEXT: .LBB86_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -149398,7 +149486,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v16f64: @@ -150727,31 +150815,31 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill @@ -150763,36 +150851,35 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill @@ -150819,63 +150906,64 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 @@ -150884,27 +150972,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 @@ -150914,35 +151002,35 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -150954,8 +151042,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -150992,10 +151080,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -151003,15 +151091,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -151023,44 +151111,44 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -151148,276 +151236,280 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v54 ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v60 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v5, v6 +; SI-NEXT: v_or_b32_e32 v12, v5, v9 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v5, v10 +; SI-NEXT: v_or_b32_e32 v16, v5, v9 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v5, v10 +; SI-NEXT: v_or_b32_e32 v20, v5, v9 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v14 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v5, v10 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_or_b32_e32 v22, v5, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v22, v5, v10 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_or_b32_e32 v24, v5, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v5, v10 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_or_b32_e32 v28, v5, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v5, v10 +; SI-NEXT: v_or_b32_e32 v29, v5, v9 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v28 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload @@ -151502,140 +151594,147 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v27, v5, v10 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v5, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v8, v5, v8 +; SI-NEXT: v_or_b32_e32 v6, v5, v6 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v30, v5, v10 +; SI-NEXT: v_or_b32_e32 v30, v5, v9 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v57, v9 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v9, v5, v26 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_or_b32_e32 v19, v5, v26 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 @@ -151644,43 +151743,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v7, v5, v38 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v19, v57, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v16, v13, v61 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v23, v62, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v3, v17 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v23, v17, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v3, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -151772,78 +151843,96 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v25 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_or_b32_e32 v11, v41, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_or_b32_e32 v15, v5, v44 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v59 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_or_b32_e32 v5, v45, v5 -; SI-NEXT: v_or_b32_e32 v17, v63, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_or_b32_e32 v17, v4, v3 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v7, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v11, v41, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v8, v5, v44 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v14, v13, v61 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v45, v5 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB88_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB88_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v59 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 @@ -151866,359 +151955,358 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v54 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v9, v57, v9 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_or_b32_e32 v10, v45, v10 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v31 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v10, v38, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_or_b32_e32 v11, v41, v11 +; SI-NEXT: v_or_b32_e32 v11, v51, v11 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v21 -; SI-NEXT: v_or_b32_e32 v11, v38, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; SI-NEXT: v_or_b32_e32 v12, v51, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v7, v26, v7 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v16, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v12, v12, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_or_b32_e32 v7, v26, v7 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_or_b32_e32 v13, v29, v13 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v9, v9, v14 -; SI-NEXT: v_or_b32_e32 v14, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v15, v9 -; SI-NEXT: v_or_b32_e32 v15, v9, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v16, v9 -; SI-NEXT: v_or_b32_e32 v16, v9, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v17, v8, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v16, v7 +; SI-NEXT: v_or_b32_e32 v16, v7, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v18, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v17, v7 +; SI-NEXT: v_or_b32_e32 v17, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v19, v8, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_or_b32_e32 v18, v7, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v20, v8, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v21, v7, v1 +; SI-NEXT: v_or_b32_e32 v7, v19, v7 +; SI-NEXT: v_or_b32_e32 v19, v7, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v20, v6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v21 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v22, v7, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v21, v6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v23, v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v22, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v24, v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v23, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v25, v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v24, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v26, v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v25, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v27, v7, v1 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v28, v7, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v6, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v26 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v26 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v28, v23 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v27, v23 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload @@ -152229,7 +152317,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -152258,7 +152346,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 @@ -152294,7 +152382,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 @@ -152330,7 +152418,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, s7, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 @@ -152368,7 +152456,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v32 ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 @@ -152378,27 +152466,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 @@ -152409,78 +152497,78 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v2 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152498,7 +152586,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152509,7 +152597,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152518,8 +152606,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152529,7 +152617,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152538,8 +152626,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152549,7 +152637,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152558,8 +152646,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152569,7 +152657,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152578,8 +152666,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152589,7 +152677,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152599,7 +152687,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152618,7 +152706,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152629,7 +152717,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152638,8 +152726,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152649,8 +152737,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152658,8 +152746,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152670,7 +152758,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152678,7 +152766,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -152689,8 +152777,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152698,8 +152786,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152710,7 +152798,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152730,7 +152818,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -152749,35 +152837,35 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -152852,39 +152940,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:152 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 ; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 @@ -152898,46 +152986,47 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v49 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill @@ -152954,7 +153043,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 @@ -152962,15 +153051,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) @@ -152993,20 +153082,20 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -153019,20 +153108,20 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -153040,17 +153129,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 @@ -153066,17 +153155,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 @@ -153091,44 +153180,44 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -153154,31 +153243,31 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -153197,35 +153286,35 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v63, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -153234,26 +153323,26 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v63, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v60, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v56, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v58, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -153262,35 +153351,35 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -153305,20 +153394,20 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -153326,189 +153415,189 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v31, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 @@ -153613,7 +153702,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: .LBB88_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB88_4 @@ -153621,53 +153710,51 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, 0x300 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -153677,19 +153764,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v39, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload @@ -153697,14 +153784,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -153721,7 +153807,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) @@ -153729,7 +153815,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: v_or_b32_e32 v28, v28, v32 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -153737,79 +153823,94 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 ; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: v_or_b32_e32 v26, v26, v34 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v36 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v36 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v37 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_u16_e32 v8, 3, v61 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v9, 3, v62 @@ -153818,30 +153919,30 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_u16_e32 v9, 3, v63 +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v59 +; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_u16_e32 v10, 3, v60 +; VI-NEXT: v_add_u16_e32 v10, 3, v57 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_u16_e32 v11, 3, v58 +; VI-NEXT: v_add_u16_e32 v11, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_add_u16_e32 v12, 3, v58 ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -153850,7 +153951,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_or_b32_e32 v12, v13, v12 @@ -153859,7 +153960,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -153871,35 +153972,35 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v15, 3, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -153908,54 +154009,43 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v19, 0x300, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v16, v19, v16 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v30, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v30, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v31, v50, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v31, v51, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v31, 0x300, v31 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v40 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v37 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v30, v30, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 ; VI-NEXT: v_or_b32_e32 v22, v22, v38 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v30, v30, v55 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v39, 3, v39 ; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload @@ -153977,7 +154067,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v39, 3, v39 ; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -154079,17 +154169,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:136 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 @@ -154097,81 +154186,81 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 ; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v48 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill @@ -154179,7 +154268,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill @@ -154196,7 +154285,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -154205,15 +154294,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -154237,20 +154326,20 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -154264,20 +154353,20 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -154286,17 +154375,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 @@ -154323,7 +154412,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 @@ -154343,48 +154432,48 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB88_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -154393,9 +154482,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 @@ -154424,10 +154513,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 @@ -154438,7 +154527,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -154453,93 +154542,93 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -154554,20 +154643,20 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6 @@ -154575,58 +154664,58 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload @@ -154644,22 +154733,22 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; kill: killed $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr37 @@ -154756,7 +154845,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -154868,27 +154957,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB88_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload @@ -154896,19 +154985,20 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3 -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload @@ -154933,7 +155023,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 @@ -154959,7 +155049,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 @@ -154983,11 +155073,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21 ; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23 ; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6 @@ -154995,16 +155085,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(3) @@ -155014,7 +155105,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload @@ -155024,7 +155115,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -155037,45 +155128,45 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v57 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v58 ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 @@ -155084,63 +155175,63 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v43 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18 ; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20 ; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6 @@ -155148,7 +155239,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -155158,14 +155249,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -155175,7 +155266,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -155188,7 +155279,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -155196,7 +155287,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40 ; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -155204,14 +155295,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42 ; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -155219,7 +155310,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 @@ -159129,17 +159220,17 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_mov_b32_e32 v45, v62 ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v32, v1 ; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 ; VI-NEXT: v_mov_b32_e32 v41, v24 ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload @@ -160376,8 +160467,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_cbranch_vccnz .LBB89_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -160410,7 +160501,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -160460,9 +160551,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_and_b32 s9, s18, 0xff ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -160495,11 +160586,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -160510,11 +160601,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -160645,9 +160736,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -160724,22 +160815,21 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 @@ -164579,8 +164669,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -165758,13 +165848,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 @@ -165856,13 +165947,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 @@ -165882,13 +165974,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 @@ -165908,13 +166001,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 @@ -165934,13 +166028,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 @@ -165959,13 +166054,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 @@ -165984,13 +166080,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 @@ -166007,13 +166104,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 @@ -166033,13 +166131,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 @@ -166117,8 +166216,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v46, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -166303,7 +166402,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; kill: killed $vgpr15 @@ -166328,7 +166427,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(36) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 @@ -166757,7 +166855,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc ; GFX9-NEXT: v_perm_b32 v33, v15, v25, s7 -; GFX9-NEXT: s_waitcnt vmcnt(52) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v32 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 @@ -178671,15 +178768,18 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v32, v30 ; SI-NEXT: v_mov_b32_e32 v44, v19 ; SI-NEXT: v_mov_b32_e32 v43, v17 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v14, v12 +; SI-NEXT: v_mov_b32_e32 v12, v10 ; SI-NEXT: v_mov_b32_e32 v41, v7 ; SI-NEXT: v_mov_b32_e32 v55, v5 ; SI-NEXT: v_mov_b32_e32 v54, v3 ; SI-NEXT: v_mov_b32_e32 v51, v1 -; SI-NEXT: v_mov_b32_e32 v30, v0 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:392 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 @@ -178689,13 +178789,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 @@ -178706,152 +178806,147 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:136 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:176 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill @@ -178871,7 +178966,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -178882,13 +178977,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill @@ -178929,15 +179024,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill @@ -178980,7 +179075,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v1 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -178989,7 +179084,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -178997,35 +179091,35 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -179040,14 +179134,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:344 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -179152,75 +179248,83 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v48, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v39, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v48, 0xff, v40 ; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 ; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 ; SI-NEXT: v_or_b32_e32 v45, v46, v45 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 ; SI-NEXT: v_or_b32_e32 v56, v56, v61 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 ; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v2, v6 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v2, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v2, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v2, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v8, v7 +; SI-NEXT: v_mov_b32_e32 v7, v19 +; SI-NEXT: v_or_b32_e32 v19, v2, v32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v31, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v45 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v23, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 @@ -179228,11 +179332,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v29, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) @@ -179244,231 +179348,227 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v2, v0 +; SI-NEXT: v_or_b32_e32 v27, v2, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v0 +; SI-NEXT: v_or_b32_e32 v29, v2, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v2, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v2, v2, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v5, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v4, v4, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v33, v6, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v35, v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v10, v8, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v8, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v34, v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v14, v8, v53 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v16, v8, v40 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v12, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v32, v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v20, v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v18, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v24, v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v12, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v24, v24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v26, v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v26, v26, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v28, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v30, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v34, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v22, v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v32, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v36, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v36, v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v37, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v37, v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v38, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v38, v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v39, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v39, v39, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v7, v7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v48, v48, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v49, v49, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -179478,21 +179578,21 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v50, v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v9, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v54, 0xff, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload @@ -179505,24 +179605,24 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v52, v52, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v53, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v53, v53, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload @@ -179531,7 +179631,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v55, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v55, v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload @@ -179553,29 +179653,33 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v40, v40, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v40 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v42, v42, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v42 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v43, 0xff, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v43, v43, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_or_b32_e32 v15, v15, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v34, v15 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v44, 0xff, v21 @@ -179583,96 +179687,69 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v44, v44, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v44 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v46, 0xff, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v46, v46, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v58, 0xff, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v58, v58, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v58 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v59, 0xff, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v59, v59, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v59 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v60, 0xff, v21 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v60, v60, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v60 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 -; SI-NEXT: v_or_b32_e32 v21, v61, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v61, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v47 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v61, v61, v63 -; SI-NEXT: v_or_b32_e32 v3, v47, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -179820,68 +179897,92 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: .LBB92_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v4, v3, v2 +; SI-NEXT: v_or_b32_e32 v5, v3, v2 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v5, v1, v2 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 -; SI-NEXT: v_mov_b32_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_mov_b32_e32 v17, v43 ; SI-NEXT: v_mov_b32_e32 v19, v44 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 ; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload @@ -179893,17 +179994,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v25, v47, v2 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v25, v25, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v27, v62, v2 +; SI-NEXT: v_or_b32_e32 v31, v62, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v29, v61, v2 +; SI-NEXT: v_or_b32_e32 v33, v61, v2 ; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 ; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v41 @@ -179914,56 +180013,66 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v31, v60, v2 +; SI-NEXT: v_or_b32_e32 v35, v60, v2 ; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v9 ; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v60 +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v33, v59, v2 +; SI-NEXT: v_or_b32_e32 v37, v59, v2 ; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v11 ; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v11, v12, v59 -; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v27 -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_or_b32_e32 v14, v14, v59 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v34, v58, v2 +; SI-NEXT: v_or_b32_e32 v38, v58, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v36, v45, v2 +; SI-NEXT: v_or_b32_e32 v39, v45, v2 ; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v21 ; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 ; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v32, v32, v58 +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v33 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 @@ -179971,7 +180080,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -179980,7 +180089,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -179989,7 +180098,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -179998,618 +180107,622 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v0, v2 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v20, v0, v20 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v0, v16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v0, v8 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v8 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v40, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v53, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: v_or_b32_e32 v7, v7, v52 -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v7, v7, v53 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v32, v32, v42 -; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v48 -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v19, v0, v47 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v34 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v0, v56 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v33 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v57 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v31 -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v0, v58 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v29 -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_or_b32_e32 v0, v2, v61 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v8, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v6, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v25 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v35 ; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v63, v0, v63 -; SI-NEXT: v_or_b32_e32 v0, v1, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v60 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v58 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v30, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload @@ -180619,7 +180732,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v10 ; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload @@ -180630,7 +180743,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v10 ; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload @@ -180645,7 +180758,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180656,7 +180769,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180667,7 +180780,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180678,7 +180791,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180688,8 +180801,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180698,9 +180811,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180709,9 +180822,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180721,8 +180834,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180731,9 +180844,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180742,9 +180855,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180754,8 +180867,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180764,9 +180877,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180775,9 +180888,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180787,8 +180900,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180797,9 +180910,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180808,9 +180921,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180820,8 +180933,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180830,9 +180943,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180841,9 +180954,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180853,8 +180966,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -180863,62 +180976,64 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v10 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -180993,39 +181108,39 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:152 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 ; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 @@ -181039,46 +181154,47 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v49 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill @@ -181095,7 +181211,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 @@ -181103,15 +181219,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) @@ -181134,20 +181250,20 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -181160,20 +181276,20 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -181181,17 +181297,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 @@ -181207,17 +181323,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 @@ -181232,44 +181348,44 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -181295,31 +181411,31 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -181338,35 +181454,35 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v63, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -181375,26 +181491,26 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v63, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v60, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v56, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v58, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -181403,35 +181519,35 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -181446,20 +181562,20 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -181467,189 +181583,189 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v31, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 @@ -181754,7 +181870,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: .LBB92_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB92_4 @@ -181762,53 +181878,51 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, 0x300 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -181818,19 +181932,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v39, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload @@ -181838,14 +181952,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -181862,7 +181975,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) @@ -181870,7 +181983,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: v_or_b32_e32 v28, v28, v32 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -181878,79 +181991,94 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 ; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: v_or_b32_e32 v26, v26, v34 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v36 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v36 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v37 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_u16_e32 v8, 3, v61 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v9, 3, v62 @@ -181959,30 +182087,30 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_u16_e32 v9, 3, v63 +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v59 +; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_u16_e32 v10, 3, v60 +; VI-NEXT: v_add_u16_e32 v10, 3, v57 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_u16_e32 v11, 3, v58 +; VI-NEXT: v_add_u16_e32 v11, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_add_u16_e32 v12, 3, v58 ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -181991,7 +182119,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_or_b32_e32 v12, v13, v12 @@ -182000,7 +182128,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -182012,35 +182140,35 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v15, 3, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -182049,54 +182177,43 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v19, 0x300, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v16, v19, v16 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v30, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v30, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v31, v50, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v31, v51, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v31, 0x300, v31 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v40 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v37 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v30, v30, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 ; VI-NEXT: v_or_b32_e32 v22, v22, v38 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v30, v30, v55 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v39, 3, v39 ; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload @@ -182118,7 +182235,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v39, 3, v39 ; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -182220,17 +182337,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:136 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 @@ -182238,81 +182354,81 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 ; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v48 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill @@ -182320,7 +182436,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill @@ -182337,7 +182453,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -182346,15 +182462,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -182378,20 +182494,20 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -182405,20 +182521,20 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -182427,17 +182543,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 @@ -182464,7 +182580,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 @@ -182484,48 +182600,48 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB92_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -182534,9 +182650,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 @@ -182565,10 +182681,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 @@ -182579,7 +182695,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -182594,93 +182710,93 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -182695,20 +182811,20 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6 @@ -182716,58 +182832,58 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload @@ -182785,22 +182901,22 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; kill: killed $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr37 @@ -182897,7 +183013,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -183009,27 +183125,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB92_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload @@ -183037,19 +183153,20 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3 -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload @@ -183074,7 +183191,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 @@ -183100,7 +183217,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 @@ -183124,11 +183241,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21 ; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23 ; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6 @@ -183136,16 +183253,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(3) @@ -183155,7 +183273,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload @@ -183165,7 +183283,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -183178,45 +183296,45 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v57 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v58 ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 @@ -183225,63 +183343,63 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v43 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18 ; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20 ; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6 @@ -183289,7 +183407,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -183299,14 +183417,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -183316,7 +183434,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -183329,7 +183447,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -183337,7 +183455,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40 ; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -183345,14 +183463,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42 ; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -183360,7 +183478,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 @@ -187174,17 +187292,17 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v45, v62 ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v32, v1 ; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 ; VI-NEXT: v_mov_b32_e32 v41, v24 ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload @@ -188421,8 +188539,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_cbranch_vccnz .LBB93_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -188455,7 +188573,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -188505,9 +188623,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_and_b32 s9, s18, 0xff ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -188540,11 +188658,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -188555,11 +188673,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -188690,9 +188808,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -188769,22 +188887,21 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 @@ -191564,8 +191681,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v28, v4, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v27, v3, v1 @@ -191629,8 +191746,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v37, v4, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v36, v3, v1 @@ -191694,8 +191811,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v51, v4, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v50, v3, v1 @@ -193129,8 +193246,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[45:46] ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49 @@ -193648,8 +193765,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 @@ -193774,7 +193891,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -193893,7 +194010,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 @@ -194032,7 +194148,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] @@ -194448,13 +194563,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 @@ -194473,13 +194589,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 @@ -194498,13 +194615,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 @@ -194523,13 +194641,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 @@ -194548,13 +194667,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 @@ -198826,13 +198946,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 @@ -199049,12 +199170,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -202519,19 +202640,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 @@ -202552,16 +202673,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -202569,266 +202687,254 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:360 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:360 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:332 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v34 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:348 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v6 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 @@ -202838,123 +202944,135 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:368 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:384 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -202964,23 +203082,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -203024,29 +203142,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v20, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v12, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: v_or_b32_e32 v42, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v2, v1 +; SI-NEXT: v_or_b32_e32 v5, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -203056,848 +203172,855 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v2, v1 +; SI-NEXT: v_or_b32_e32 v11, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v1, v2 +; SI-NEXT: v_or_b32_e32 v40, v1, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v55, v24, v1 +; SI-NEXT: v_or_b32_e32 v10, v24, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v24, v33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v13, v2, v1 +; SI-NEXT: v_or_b32_e32 v15, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v26, v30, v1 +; SI-NEXT: v_or_b32_e32 v18, v25, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v21, v2, v1 +; SI-NEXT: v_or_b32_e32 v19, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v31, v2, v1 +; SI-NEXT: v_or_b32_e32 v27, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v1, v2 +; SI-NEXT: v_or_b32_e32 v38, v1, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v1, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v53, v4, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v1, v4 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v55, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v43, v10, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v4, v10 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v17, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v10, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v10, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v14, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v56, v4, v10 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v10, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v44, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v57, v10, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v61, v4, v10 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v45, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v10, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v10, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v25, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v4, v4, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v47, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v23, v10, v22 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v58, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v10, v22 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v10, v29 -; SI-NEXT: v_or_b32_e32 v29, v22, v34 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v59, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v62, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v1, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v1, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v1, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v34, v37, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v37, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v37, v51, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v51, v22, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v22, v63 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v34, v56, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_or_b32_e32 v47, v59, v22 +; SI-NEXT: v_or_b32_e32 v56, v60, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v54, v22, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v19, v8 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v14, v14, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v54, v22, v4 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v9, v9, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v22, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v32, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v22, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v22, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v22, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v57, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v60, v22, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v60, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v22, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v10, v22, v10 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v38, v22, v24 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v48, v22, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v22, v24 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v22, v24 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v46, v22, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v22, v24 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v40, v22, v40 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v22, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v22, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v53, v22, v53 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v15, v22, v15 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v45 -; SI-NEXT: v_or_b32_e32 v45, v22, v62 -; SI-NEXT: v_mov_b32_e32 v62, v18 -; SI-NEXT: v_or_b32_e32 v20, v20, v62 -; SI-NEXT: v_or_b32_e32 v22, v19, v3 -; SI-NEXT: v_alignbit_b32 v3, v20, v3, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v42 +; SI-NEXT: v_or_b32_e32 v31, v22, v31 +; SI-NEXT: v_or_b32_e32 v22, v12, v61 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v43, v12, v5 +; SI-NEXT: v_alignbit_b32 v5, v22, v5, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v32, v32, v59 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v3, v19 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v3, v9 -; SI-NEXT: v_alignbit_b32 v3, v5, v9, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v41, v3, v55 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v18, v3, v13 -; SI-NEXT: v_alignbit_b32 v3, v41, v13, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_alignbit_b32 v11, v5, v11, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v42, v11, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v40, v11, v15 +; SI-NEXT: v_alignbit_b32 v11, v42, v15, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v11, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v39, v11, v19 +; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v13, v3, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_or_b32_e32 v39, v3, v21 -; SI-NEXT: v_alignbit_b32 v3, v13, v21, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v32, v3, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v42, v3, v27 -; SI-NEXT: v_alignbit_b32 v3, v32, v27, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v28, v11, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v11, v28, v21, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v11, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v21, v11, v27 +; SI-NEXT: v_alignbit_b32 v11, v29, v27, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v27, v3, v35 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v3, v3, v31 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v27, v31, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v21, v3, v48 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v31, v3, v50 -; SI-NEXT: v_alignbit_b32 v3, v21, v50, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v3, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v3, v3, v53 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v11, v53, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_or_b32_e32 v6, v6, v43 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v3, v43, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v9, v6, v16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v9, v46, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v63, v6, v17 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v6, v6, v57 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v19, v11, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v27, v11, v52 +; SI-NEXT: v_alignbit_b32 v11, v19, v52, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v2 +; SI-NEXT: v_alignbit_b32 v1, v11, v55, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v63, v57, 16 -; SI-NEXT: v_or_b32_e32 v57, v4, v23 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v52, v4, v33 -; SI-NEXT: v_alignbit_b32 v4, v57, v33, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v15, v1, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v46, v4, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v38, v4, v37 -; SI-NEXT: v_alignbit_b32 v4, v46, v37, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v44, v4, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_or_b32_e32 v36, v4, v47 -; SI-NEXT: v_alignbit_b32 v4, v44, v47, 16 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v15, v44, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v54 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_or_b32_e32 v43, v4, v8 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_or_b32_e32 v12, v4, v14 -; SI-NEXT: v_alignbit_b32 v4, v43, v14, 16 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v17, v1, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_mov_b32_e32 v23, v12 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v47, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v61 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_or_b32_e32 v61, v6, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_or_b32_e32 v58, v6, v49 -; SI-NEXT: v_alignbit_b32 v6, v61, v49, 16 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_alignbit_b32 v32, v1, v59, 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v59, v6, v23 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v62, v32, v24 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v50, v6, v36 +; SI-NEXT: v_alignbit_b32 v6, v59, v36, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v47, v6, v35 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v49, v6, v37 +; SI-NEXT: v_alignbit_b32 v6, v47, v37, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v45, v6, v34 +; SI-NEXT: v_or_b32_e32 v48, v3, v56 +; SI-NEXT: v_alignbit_b32 v3, v45, v56, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v44, v3, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v46, v32, v13 +; SI-NEXT: v_alignbit_b32 v13, v62, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v44, v9, 16 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v34 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: .LBB96_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 -; SI-NEXT: v_or_b32_e32 v2, v8, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_mov_b32_e32 v30, v16 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v2 -; SI-NEXT: v_mov_b32_e32 v28, v24 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v24 +; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_or_b32_e32 v3, v15, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_or_b32_e32 v4, v59, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v47, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v51, v6 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v60, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v32 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v44 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v5 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v5 +; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v46, vcc, s7, v6 -; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v3 -; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v47, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -203906,16 +204029,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v7 -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v50, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -203924,15 +204047,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v57, vcc, s7, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v59, vcc, s7, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -203941,16 +204064,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v9 -; SI-NEXT: v_mov_b32_e32 v58, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v9 +; SI-NEXT: v_mov_b32_e32 v46, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -203959,9 +204082,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v62, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 @@ -203978,7 +204101,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 @@ -203993,15 +204116,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v63, vcc, s7, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -204010,50 +204133,45 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -204063,9 +204181,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v46 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 @@ -204079,468 +204196,476 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v19, v19, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v18 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v19 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v17, v20, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v17, v22, v17 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v22, v22, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v17, v23, v17 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v23 -; SI-NEXT: v_mov_b32_e32 v23, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_or_b32_e32 v23, v26, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_or_b32_e32 v24, v24, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v25, v25, v21 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v20 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v24, v24, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v25, v25, v17 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14 +; SI-NEXT: v_mov_b32_e32 v14, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v30, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v40, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v41, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v57 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v42, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v1 -; SI-NEXT: v_alignbit_b32 v1, v20, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v5, v25, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v2 +; SI-NEXT: v_alignbit_b32 v2, v22, v43, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v5, v7, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v13, v39, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v42, v40, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v32, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v26, v39, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v27, v15, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v28, v18, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v29, v21, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v31, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v11, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v11, v16, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v3, v12, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v15, v13, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v9, v10, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v17, v10, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v63, v8, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v1, v9, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v61, v7, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v57, v6, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v59, v6, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v46, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v47, v33, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v44, v34, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v45, v32, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v43, v23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v44, v14, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v44 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -204549,14 +204674,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -204564,77 +204689,74 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -204711,39 +204833,39 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:152 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 ; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 @@ -204757,46 +204879,47 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v49 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill @@ -204813,7 +204936,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 @@ -204821,15 +204944,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) @@ -204852,20 +204975,20 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -204878,20 +205001,20 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -204899,17 +205022,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 @@ -204925,17 +205048,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 @@ -204950,44 +205073,44 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -205013,31 +205136,31 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -205056,35 +205179,35 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v63, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -205093,26 +205216,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v63, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v60, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v56, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v58, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -205121,35 +205244,35 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -205164,20 +205287,20 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -205185,189 +205308,189 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; kill: killed $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v31, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 @@ -205472,7 +205595,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_4 @@ -205480,53 +205603,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, 0x300 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -205536,19 +205657,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v39, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload @@ -205556,14 +205677,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -205580,7 +205700,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) @@ -205588,7 +205708,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: v_or_b32_e32 v28, v28, v32 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -205596,79 +205716,94 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 ; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: v_or_b32_e32 v26, v26, v34 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v36 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v36 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v37 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_u16_e32 v8, 3, v61 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v9, 3, v62 @@ -205677,30 +205812,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_u16_e32 v9, 3, v63 +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v59 +; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_u16_e32 v10, 3, v60 +; VI-NEXT: v_add_u16_e32 v10, 3, v57 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_u16_e32 v11, 3, v58 +; VI-NEXT: v_add_u16_e32 v11, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_add_u16_e32 v12, 3, v58 ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -205709,7 +205844,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_or_b32_e32 v12, v13, v12 @@ -205718,7 +205853,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -205730,35 +205865,35 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v15, 3, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -205767,54 +205902,43 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v19, 0x300, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v16, v19, v16 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v30, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v30, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v31, v50, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v31, v51, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v31, 0x300, v31 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v40 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v37 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v30, v30, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 ; VI-NEXT: v_or_b32_e32 v22, v22, v38 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v30, v30, v55 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v39, 3, v39 ; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload @@ -205836,7 +205960,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v39, 3, v39 ; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -205938,17 +206062,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:136 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 @@ -205956,81 +206079,81 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 ; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v48 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill @@ -206038,7 +206161,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill @@ -206055,7 +206178,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -206064,15 +206187,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -206096,20 +206219,20 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -206123,20 +206246,20 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -206145,17 +206268,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 @@ -206182,7 +206305,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 @@ -206202,48 +206325,48 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -206252,9 +206375,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 @@ -206283,10 +206406,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 @@ -206297,7 +206420,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206312,93 +206435,93 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206413,20 +206536,20 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6 @@ -206434,58 +206557,58 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload @@ -206503,22 +206626,22 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; kill: killed $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr37 @@ -206615,7 +206738,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206727,27 +206850,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload @@ -206755,19 +206878,20 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3 -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload @@ -206792,7 +206916,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 @@ -206818,7 +206942,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 @@ -206842,11 +206966,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21 ; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23 ; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6 @@ -206854,16 +206978,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(3) @@ -206873,7 +206998,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload @@ -206883,7 +207008,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -206896,45 +207021,45 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v57 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v58 ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 @@ -206943,63 +207068,63 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v43 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18 ; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20 ; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6 @@ -207007,7 +207132,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -207017,14 +207142,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -207034,7 +207159,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -207047,7 +207172,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -207055,7 +207180,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40 ; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -207063,14 +207188,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42 ; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -207078,7 +207203,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 @@ -208842,8 +208967,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill @@ -208859,41 +208984,48 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:240 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:196 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v62, s28, 0 +; SI-NEXT: v_writelane_b32 v62, s25, 1 +; SI-NEXT: v_writelane_b32 v62, s24, 2 +; SI-NEXT: v_writelane_b32 v62, s23, 3 +; SI-NEXT: v_writelane_b32 v62, s22, 4 +; SI-NEXT: v_writelane_b32 v62, s21, 5 +; SI-NEXT: v_writelane_b32 v62, s18, 6 +; SI-NEXT: v_writelane_b32 v62, s16, 7 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 @@ -208911,635 +209043,625 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v63, s55, 15 ; SI-NEXT: v_writelane_b32 v63, s64, 16 ; SI-NEXT: v_writelane_b32 v63, s65, 17 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v63, s66, 18 -; SI-NEXT: v_writelane_b32 v62, s28, 0 ; SI-NEXT: v_writelane_b32 v63, s67, 19 -; SI-NEXT: v_writelane_b32 v62, s27, 1 ; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: v_writelane_b32 v62, s25, 2 ; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_writelane_b32 v62, s24, 3 ; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_writelane_b32 v62, s23, 4 ; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_writelane_b32 v62, s22, 5 ; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_writelane_b32 v62, s21, 6 ; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_writelane_b32 v62, s20, 7 ; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_writelane_b32 v62, s19, 8 ; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_writelane_b32 v62, s18, 9 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_writelane_b32 v62, s16, 10 ; SI-NEXT: v_writelane_b32 v63, s85, 29 ; SI-NEXT: v_writelane_b32 v63, s86, 30 ; SI-NEXT: v_writelane_b32 v63, s87, 31 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v29, v5 -; SI-NEXT: v_readfirstlane_b32 s76, v18 -; SI-NEXT: v_readfirstlane_b32 s40, v25 -; SI-NEXT: v_readfirstlane_b32 s16, v24 -; SI-NEXT: v_readfirstlane_b32 s42, v23 -; SI-NEXT: v_readfirstlane_b32 s52, v20 -; SI-NEXT: v_readfirstlane_b32 s8, v19 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: v_readfirstlane_b32 s15, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v25 +; SI-NEXT: v_readfirstlane_b32 s43, v15 +; SI-NEXT: v_readfirstlane_b32 s42, v24 +; SI-NEXT: v_readfirstlane_b32 s44, v23 +; SI-NEXT: v_readfirstlane_b32 s49, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s53, v20 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s50, v35 -; SI-NEXT: v_readfirstlane_b32 s31, v36 -; SI-NEXT: v_readfirstlane_b32 s53, v37 -; SI-NEXT: v_readfirstlane_b32 s82, v48 -; SI-NEXT: v_readfirstlane_b32 s7, v49 -; SI-NEXT: v_readfirstlane_b32 s79, v52 -; SI-NEXT: v_readfirstlane_b32 s78, v55 -; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: v_writelane_b32 v62, s4, 11 -; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: v_readfirstlane_b32 s79, v52 +; SI-NEXT: v_readfirstlane_b32 s88, v54 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:136 ; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136 -; SI-NEXT: v_readfirstlane_b32 s4, v45 -; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: v_readfirstlane_b32 s77, v41 +; SI-NEXT: v_readfirstlane_b32 s4, v42 +; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: v_readfirstlane_b32 s70, v32 +; SI-NEXT: v_readfirstlane_b32 s51, v33 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v56 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: v_readfirstlane_b32 s4, v57 -; SI-NEXT: v_writelane_b32 v62, s4, 15 -; SI-NEXT: v_readfirstlane_b32 s4, v60 -; SI-NEXT: v_readfirstlane_b32 s86, v31 -; SI-NEXT: v_readfirstlane_b32 s36, v32 -; SI-NEXT: v_readfirstlane_b32 s71, v33 -; SI-NEXT: v_readfirstlane_b32 s77, v59 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v61 +; SI-NEXT: v_readfirstlane_b32 s37, v45 +; SI-NEXT: v_readfirstlane_b32 s24, v56 +; SI-NEXT: v_readfirstlane_b32 s7, v57 +; SI-NEXT: v_readfirstlane_b32 s92, v58 +; SI-NEXT: v_readfirstlane_b32 s28, v59 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: v_readfirstlane_b32 s98, v50 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 +; SI-NEXT: v_readfirstlane_b32 s35, v43 +; SI-NEXT: v_readfirstlane_b32 s55, v46 +; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: v_readfirstlane_b32 s67, v39 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s67, v53 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: v_writelane_b32 v62, s4, 17 -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v62, s4, 18 -; SI-NEXT: v_readfirstlane_b32 s81, v34 -; SI-NEXT: v_readfirstlane_b32 s75, v39 -; SI-NEXT: v_readfirstlane_b32 s68, v42 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 -; SI-NEXT: v_readfirstlane_b32 s51, v54 -; SI-NEXT: v_readfirstlane_b32 s97, v51 -; SI-NEXT: v_readfirstlane_b32 s35, v27 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s28, v28 -; SI-NEXT: v_readfirstlane_b32 s87, v26 +; SI-NEXT: v_readfirstlane_b32 s74, v53 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: v_readfirstlane_b32 s85, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: v_readfirstlane_b32 s98, v40 +; SI-NEXT: v_readfirstlane_b32 s69, v51 +; SI-NEXT: v_readfirstlane_b32 s21, v36 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_readfirstlane_b32 s23, v28 +; SI-NEXT: v_readfirstlane_b32 s34, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, v13 +; SI-NEXT: v_mov_b32_e32 v13, v5 +; SI-NEXT: v_readfirstlane_b32 s97, v29 +; SI-NEXT: v_readfirstlane_b32 s80, v18 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v30 ; SI-NEXT: v_readfirstlane_b32 s96, v17 -; SI-NEXT: v_readfirstlane_b32 s99, v16 -; SI-NEXT: v_readfirstlane_b32 s89, v15 -; SI-NEXT: v_readfirstlane_b32 s88, v12 -; SI-NEXT: v_readfirstlane_b32 s30, v11 -; SI-NEXT: v_readfirstlane_b32 s64, v10 -; SI-NEXT: v_readfirstlane_b32 s55, v9 -; SI-NEXT: v_readfirstlane_b32 s65, v8 -; SI-NEXT: v_readfirstlane_b32 s80, v7 -; SI-NEXT: v_readfirstlane_b32 s21, v2 -; SI-NEXT: v_readfirstlane_b32 s74, v1 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 -; SI-NEXT: v_readfirstlane_b32 s93, v36 -; SI-NEXT: v_readfirstlane_b32 s24, v37 -; SI-NEXT: v_readfirstlane_b32 s27, v48 -; SI-NEXT: v_readfirstlane_b32 s84, v43 -; SI-NEXT: v_readfirstlane_b32 s83, v44 -; SI-NEXT: v_readfirstlane_b32 s85, v46 +; SI-NEXT: v_readfirstlane_b32 s64, v9 +; SI-NEXT: v_readfirstlane_b32 s25, v8 +; SI-NEXT: v_readfirstlane_b32 s83, v7 +; SI-NEXT: v_readfirstlane_b32 s84, v4 +; SI-NEXT: v_readfirstlane_b32 s93, v3 +; SI-NEXT: v_readfirstlane_b32 s76, v1 +; SI-NEXT: v_readfirstlane_b32 s58, v38 +; SI-NEXT: v_readfirstlane_b32 s65, v49 +; SI-NEXT: v_readfirstlane_b32 s62, v54 +; SI-NEXT: v_readfirstlane_b32 s81, v44 +; SI-NEXT: v_readfirstlane_b32 s71, v47 +; SI-NEXT: v_readfirstlane_b32 s38, v60 +; SI-NEXT: v_readfirstlane_b32 s86, v61 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:220 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s20, v47 -; SI-NEXT: v_readfirstlane_b32 s4, v58 -; SI-NEXT: v_writelane_b32 v62, s4, 19 -; SI-NEXT: v_readfirstlane_b32 s23, v49 -; SI-NEXT: v_readfirstlane_b32 s92, v52 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 +; SI-NEXT: v_readfirstlane_b32 s90, v50 +; SI-NEXT: v_readfirstlane_b32 s31, v52 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 -; SI-NEXT: v_readfirstlane_b32 s90, v35 -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: v_readfirstlane_b32 s38, v32 -; SI-NEXT: v_readfirstlane_b32 s70, v33 -; SI-NEXT: v_readfirstlane_b32 s54, v59 -; SI-NEXT: v_readfirstlane_b32 s57, v60 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: v_readfirstlane_b32 s82, v56 +; SI-NEXT: v_readfirstlane_b32 s95, v57 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s56, v61 -; SI-NEXT: v_readfirstlane_b32 s59, v55 -; SI-NEXT: v_readfirstlane_b32 s61, v41 -; SI-NEXT: v_readfirstlane_b32 s19, v45 -; SI-NEXT: v_readfirstlane_b32 s34, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:252 +; SI-NEXT: v_readfirstlane_b32 s39, v58 +; SI-NEXT: v_readfirstlane_b32 s56, v59 +; SI-NEXT: v_readfirstlane_b32 s57, v41 +; SI-NEXT: v_readfirstlane_b32 s36, v42 +; SI-NEXT: v_readfirstlane_b32 s73, v45 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 -; SI-NEXT: v_writelane_b32 v62, s4, 20 -; SI-NEXT: v_readfirstlane_b32 s25, v53 -; SI-NEXT: v_readfirstlane_b32 s91, v40 -; SI-NEXT: v_readfirstlane_b32 s37, v34 -; SI-NEXT: v_readfirstlane_b32 s47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 +; SI-NEXT: v_readfirstlane_b32 s16, v34 +; SI-NEXT: v_readfirstlane_b32 s48, v32 +; SI-NEXT: v_readfirstlane_b32 s52, v33 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_readfirstlane_b32 s47, v35 +; SI-NEXT: v_readfirstlane_b32 s60, v37 +; SI-NEXT: v_readfirstlane_b32 s61, v39 +; SI-NEXT: v_readfirstlane_b32 s89, v43 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s46, v57 -; SI-NEXT: v_readfirstlane_b32 s22, v42 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v22 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 -; SI-NEXT: v_readfirstlane_b32 s72, v39 -; SI-NEXT: v_readfirstlane_b32 s94, v51 -; SI-NEXT: v_readfirstlane_b32 s48, v54 -; SI-NEXT: v_readfirstlane_b32 s66, v43 -; SI-NEXT: v_readfirstlane_b32 s69, v44 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s45, v46 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: v_writelane_b32 v62, s4, 21 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: v_writelane_b32 v62, s4, 22 -; SI-NEXT: v_writelane_b32 v62, s17, 23 -; SI-NEXT: v_writelane_b32 v62, s40, 24 -; SI-NEXT: v_writelane_b32 v62, s16, 25 -; SI-NEXT: v_writelane_b32 v62, s42, 26 -; SI-NEXT: v_writelane_b32 v62, s46, 27 -; SI-NEXT: v_writelane_b32 v62, s47, 28 -; SI-NEXT: v_writelane_b32 v62, s56, 29 -; SI-NEXT: v_writelane_b32 v62, s57, 30 -; SI-NEXT: v_writelane_b32 v62, s45, 31 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s58, v52 -; SI-NEXT: v_writelane_b32 v62, s49, 32 -; SI-NEXT: v_writelane_b32 v62, s58, 33 -; SI-NEXT: v_writelane_b32 v62, s59, 34 -; SI-NEXT: v_writelane_b32 v62, s52, 35 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s60, v38 +; SI-NEXT: v_readfirstlane_b32 s99, v46 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:280 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_writelane_b32 v62, s60, 36 -; SI-NEXT: v_writelane_b32 v62, s61, 37 -; SI-NEXT: v_writelane_b32 v62, s93, 38 -; SI-NEXT: v_writelane_b32 v62, s8, 39 -; SI-NEXT: v_readfirstlane_b32 s62, v47 -; SI-NEXT: v_writelane_b32 v62, s72, 40 -; SI-NEXT: v_readfirstlane_b32 s73, v58 -; SI-NEXT: v_writelane_b32 v62, s62, 41 -; SI-NEXT: v_writelane_b32 v62, s73, 42 -; SI-NEXT: v_writelane_b32 v62, s35, 43 -; SI-NEXT: v_writelane_b32 v62, s94, 44 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 +; SI-NEXT: v_readfirstlane_b32 s54, v48 +; SI-NEXT: v_readfirstlane_b32 s50, v53 +; SI-NEXT: v_readfirstlane_b32 s78, v49 +; SI-NEXT: v_readfirstlane_b32 s30, v51 +; SI-NEXT: v_readfirstlane_b32 s66, v54 +; SI-NEXT: v_readfirstlane_b32 s91, v40 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s6, v44 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_writelane_b32 v62, s17, 17 +; SI-NEXT: v_writelane_b32 v62, s15, 18 +; SI-NEXT: v_writelane_b32 v62, s18, 19 +; SI-NEXT: v_writelane_b32 v62, s43, 20 +; SI-NEXT: v_writelane_b32 v62, s42, 21 +; SI-NEXT: v_writelane_b32 v62, s44, 22 +; SI-NEXT: v_writelane_b32 v62, s16, 23 +; SI-NEXT: v_writelane_b32 v62, s49, 24 +; SI-NEXT: v_writelane_b32 v62, s8, 25 +; SI-NEXT: v_writelane_b32 v62, s6, 26 +; SI-NEXT: v_readfirstlane_b32 s45, v52 +; SI-NEXT: v_writelane_b32 v62, s56, 27 +; SI-NEXT: v_writelane_b32 v62, s45, 28 +; SI-NEXT: v_writelane_b32 v62, s53, 29 +; SI-NEXT: v_writelane_b32 v62, s94, 30 +; SI-NEXT: v_writelane_b32 v62, s57, 31 +; SI-NEXT: v_writelane_b32 v62, s58, 32 +; SI-NEXT: v_writelane_b32 v62, s47, 33 +; SI-NEXT: v_readfirstlane_b32 s46, v55 +; SI-NEXT: v_writelane_b32 v62, s40, 34 +; SI-NEXT: v_readfirstlane_b32 s59, v47 +; SI-NEXT: v_writelane_b32 v62, s46, 35 +; SI-NEXT: v_writelane_b32 v62, s59, 36 +; SI-NEXT: v_writelane_b32 v62, s60, 37 +; SI-NEXT: v_writelane_b32 v62, s36, 38 +; SI-NEXT: v_writelane_b32 v62, s65, 39 +; SI-NEXT: v_writelane_b32 v62, s61, 40 +; SI-NEXT: v_writelane_b32 v62, s73, 41 +; SI-NEXT: v_writelane_b32 v62, s62, 42 +; SI-NEXT: v_writelane_b32 v62, s72, 43 +; SI-NEXT: v_writelane_b32 v62, s23, 44 ; SI-NEXT: v_writelane_b32 v62, s48, 45 -; SI-NEXT: v_writelane_b32 v62, s91, 46 +; SI-NEXT: v_writelane_b32 v62, s34, 46 +; SI-NEXT: v_writelane_b32 v62, s78, 47 +; SI-NEXT: v_writelane_b32 v62, s30, 48 +; SI-NEXT: v_writelane_b32 v62, s54, 49 +; SI-NEXT: v_writelane_b32 v62, s50, 50 +; SI-NEXT: v_writelane_b32 v62, s52, 51 +; SI-NEXT: v_writelane_b32 v62, s82, 52 +; SI-NEXT: v_writelane_b32 v62, s66, 53 +; SI-NEXT: v_readfirstlane_b32 s22, v36 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v57 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v59 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v45 -; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v55 -; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v50 -; SI-NEXT: v_writelane_b32 v62, s66, 47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v18 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB97_2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v58 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v59 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v60 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v31 +; SI-NEXT: v_writelane_b32 v62, s91, 54 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v21, v29 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s4, v62, 7 -; SI-NEXT: v_readlane_b32 s5, v62, 6 -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v5, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s5, v62, 5 +; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_writelane_b32 v62, s4, 48 -; SI-NEXT: v_readlane_b32 s4, v62, 5 +; SI-NEXT: v_writelane_b32 v62, s4, 55 +; SI-NEXT: v_readlane_b32 s4, v62, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: v_readlane_b32 s5, v62, 3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s63, s5, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 9 +; SI-NEXT: v_readlane_b32 s4, v62, 6 ; SI-NEXT: s_and_b32 s5, s4, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 8 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s4, 24 +; SI-NEXT: s_lshl_b32 s9, s19, 24 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: s_or_b32 s9, s9, s5 ; SI-NEXT: s_and_b32 s5, s4, 0xff ; SI-NEXT: s_lshl_b32 s10, s29, 8 ; SI-NEXT: s_or_b32 s4, s5, s10 -; SI-NEXT: v_writelane_b32 v62, s4, 49 -; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: v_writelane_b32 v62, s4, 56 +; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: v_readlane_b32 s10, v62, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_writelane_b32 v62, s54, 50 -; SI-NEXT: s_lshl_b32 s11, s21, 24 -; SI-NEXT: s_mov_b32 s18, s22 -; SI-NEXT: s_mov_b32 s22, s21 -; SI-NEXT: s_or_b32 s21, s11, s5 +; SI-NEXT: s_lshl_b32 s11, s10, 24 +; SI-NEXT: s_or_b32 s5, s11, s5 ; SI-NEXT: s_and_b32 s11, s26, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 1 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s4, 24 +; SI-NEXT: s_lshl_b32 s12, s27, 24 ; SI-NEXT: s_or_b32 s14, s12, s11 -; SI-NEXT: s_and_b32 s11, s80, 0xff -; SI-NEXT: s_lshl_b32 s12, s65, 8 -; SI-NEXT: s_or_b32 s12, s11, s12 -; SI-NEXT: s_and_b32 s11, s55, 0xff +; SI-NEXT: s_and_b32 s11, s83, 0xff +; SI-NEXT: s_lshl_b32 s12, s25, 8 +; SI-NEXT: s_or_b32 s10, s11, s12 +; SI-NEXT: v_writelane_b32 v62, s10, 57 +; SI-NEXT: s_and_b32 s11, s64, 0xff +; SI-NEXT: v_readlane_b32 s10, v62, 15 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s13, s64, 24 +; SI-NEXT: s_lshl_b32 s13, s10, 24 ; SI-NEXT: s_or_b32 s41, s13, s11 -; SI-NEXT: s_and_b32 s11, s89, 0xff -; SI-NEXT: s_lshl_b32 s13, s99, 8 -; SI-NEXT: s_or_b32 s13, s11, s13 +; SI-NEXT: s_and_b32 s11, s43, 0xff +; SI-NEXT: s_lshl_b32 s13, s15, 8 +; SI-NEXT: s_or_b32 s10, s11, s13 ; SI-NEXT: s_and_b32 s11, s96, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s76, 24 +; SI-NEXT: s_lshl_b32 s15, s80, 24 ; SI-NEXT: s_or_b32 s43, s15, s11 -; SI-NEXT: s_and_b32 s11, s42, 0xff -; SI-NEXT: s_lshl_b32 s15, s16, 8 -; SI-NEXT: s_or_b32 s16, s11, s15 -; SI-NEXT: s_and_b32 s11, s40, 0xff +; SI-NEXT: s_and_b32 s11, s44, 0xff +; SI-NEXT: s_lshl_b32 s15, s42, 8 +; SI-NEXT: s_or_b32 s13, s11, s15 +; SI-NEXT: s_and_b32 s11, s18, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s87, 24 +; SI-NEXT: s_lshl_b32 s15, s97, 24 ; SI-NEXT: s_or_b32 s44, s15, s11 -; SI-NEXT: s_and_b32 s11, s73, 0xff -; SI-NEXT: s_lshl_b32 s15, s62, 8 -; SI-NEXT: s_or_b32 s62, s11, s15 -; SI-NEXT: s_and_b32 s11, s58, 0xff +; SI-NEXT: s_and_b32 s11, s59, 0xff +; SI-NEXT: s_lshl_b32 s15, s46, 8 +; SI-NEXT: s_or_b32 s12, s11, s15 +; SI-NEXT: s_and_b32 s11, s45, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s45, 24 +; SI-NEXT: s_lshl_b32 s15, s6, 24 ; SI-NEXT: s_or_b32 s45, s15, s11 -; SI-NEXT: s_and_b32 s11, s48, 0xff -; SI-NEXT: s_lshl_b32 s15, s94, 8 +; SI-NEXT: s_and_b32 s11, s30, 0xff +; SI-NEXT: s_lshl_b32 s15, s78, 8 +; SI-NEXT: v_writelane_b32 v62, s10, 58 ; SI-NEXT: s_or_b32 s10, s11, s15 -; SI-NEXT: s_and_b32 s11, s46, 0xff +; SI-NEXT: s_and_b32 s11, s99, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s47, 24 +; SI-NEXT: s_lshl_b32 s15, s89, 24 ; SI-NEXT: s_or_b32 s46, s15, s11 -; SI-NEXT: s_and_b32 s11, s25, 0xff -; SI-NEXT: s_lshl_b32 s15, s34, 8 -; SI-NEXT: s_or_b32 s94, s11, s15 -; SI-NEXT: s_and_b32 s11, s72, 0xff +; SI-NEXT: s_and_b32 s11, s61, 0xff +; SI-NEXT: s_lshl_b32 s15, s60, 8 +; SI-NEXT: s_or_b32 s6, s11, s15 +; SI-NEXT: s_and_b32 s11, s22, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s60, 24 +; SI-NEXT: s_lshl_b32 s15, s47, 24 ; SI-NEXT: s_or_b32 s47, s15, s11 -; SI-NEXT: s_and_b32 s11, s61, 0xff -; SI-NEXT: s_lshl_b32 s15, s59, 8 -; SI-NEXT: s_or_b32 s73, s11, s15 -; SI-NEXT: s_and_b32 s11, s56, 0xff +; SI-NEXT: s_and_b32 s11, s57, 0xff +; SI-NEXT: s_lshl_b32 s15, s56, 8 +; SI-NEXT: v_writelane_b32 v62, s6, 59 +; SI-NEXT: s_or_b32 s6, s11, s15 +; SI-NEXT: s_and_b32 s11, s39, 0xff +; SI-NEXT: v_writelane_b32 v62, s6, 60 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s57, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 20 +; SI-NEXT: s_lshl_b32 s15, s95, 24 ; SI-NEXT: s_or_b32 s56, s15, s11 -; SI-NEXT: s_and_b32 s11, s38, 0xff -; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: s_and_b32 s11, s48, 0xff +; SI-NEXT: s_lshl_b32 s15, s72, 8 +; SI-NEXT: v_readlane_b32 s6, v62, 14 ; SI-NEXT: s_or_b32 s48, s11, s15 -; SI-NEXT: s_and_b32 s11, s92, 0xff +; SI-NEXT: s_and_b32 s11, s6, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s23, 24 +; SI-NEXT: s_lshl_b32 s15, s31, 24 ; SI-NEXT: s_or_b32 vcc_lo, s15, s11 -; SI-NEXT: s_and_b32 s11, s20, 0xff -; SI-NEXT: s_lshl_b32 s15, s85, 8 +; SI-NEXT: s_and_b32 s11, s86, 0xff +; SI-NEXT: s_lshl_b32 s15, s38, 8 ; SI-NEXT: s_or_b32 s72, s11, s15 -; SI-NEXT: s_and_b32 s11, s83, 0xff +; SI-NEXT: s_and_b32 s11, s71, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s84, 24 +; SI-NEXT: s_lshl_b32 s15, s81, 24 ; SI-NEXT: s_or_b32 vcc_hi, s15, s11 -; SI-NEXT: s_and_b32 s11, s93, 0xff -; SI-NEXT: s_lshl_b32 s15, s97, 8 +; SI-NEXT: s_and_b32 s11, s58, 0xff +; SI-NEXT: s_lshl_b32 s15, s85, 8 ; SI-NEXT: s_or_b32 s57, s11, s15 -; SI-NEXT: s_and_b32 s11, s67, 0xff +; SI-NEXT: s_and_b32 s11, s69, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s51, 24 -; SI-NEXT: v_writelane_b32 v62, s67, 51 -; SI-NEXT: s_mov_b32 s67, s51 -; SI-NEXT: s_mov_b32 s51, s74 +; SI-NEXT: s_lshl_b32 s15, s74, 24 +; SI-NEXT: v_writelane_b32 v62, s74, 61 ; SI-NEXT: s_or_b32 s74, s15, s11 -; SI-NEXT: s_and_b32 s11, s98, 0xff -; SI-NEXT: s_lshl_b32 s15, s75, 8 -; SI-NEXT: v_readlane_b32 s4, v62, 18 -; SI-NEXT: v_writelane_b32 v62, s87, 52 +; SI-NEXT: s_and_b32 s11, s87, 0xff +; SI-NEXT: s_lshl_b32 s15, s21, 8 ; SI-NEXT: s_or_b32 s58, s11, s15 -; SI-NEXT: s_and_b32 s11, s4, 0xff -; SI-NEXT: v_writelane_b32 v62, s25, 53 +; SI-NEXT: s_and_b32 s11, s68, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s81, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: s_mov_b32 s54, s75 +; SI-NEXT: s_lshl_b32 s15, s28, 24 ; SI-NEXT: s_or_b32 s75, s15, s11 -; SI-NEXT: s_and_b32 s11, s77, 0xff -; SI-NEXT: s_lshl_b32 s15, s4, 8 -; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: s_lshl_b32 s15, s55, 8 +; SI-NEXT: v_writelane_b32 v62, s25, 62 ; SI-NEXT: s_or_b32 s59, s11, s15 -; SI-NEXT: s_and_b32 s11, s4, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 13 +; SI-NEXT: s_and_b32 s11, s37, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s4, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 12 -; SI-NEXT: s_mov_b32 s95, s69 -; SI-NEXT: s_mov_b32 s69, s76 +; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 13 +; SI-NEXT: s_mov_b32 s18, s21 +; SI-NEXT: s_mov_b32 s21, s97 +; SI-NEXT: s_mov_b32 s97, s37 +; SI-NEXT: s_mov_b32 s37, s76 ; SI-NEXT: s_or_b32 s76, s15, s11 -; SI-NEXT: s_and_b32 s11, s86, 0xff +; SI-NEXT: s_and_b32 s11, s35, 0xff ; SI-NEXT: s_lshl_b32 s15, s4, 8 -; SI-NEXT: v_readlane_b32 s4, v62, 11 ; SI-NEXT: s_or_b32 s60, s11, s15 -; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: s_and_b32 s11, s77, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 12 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s78, 24 -; SI-NEXT: s_mov_b32 s93, s99 -; SI-NEXT: s_mov_b32 s99, s84 -; SI-NEXT: s_mov_b32 s84, s77 +; SI-NEXT: s_lshl_b32 s15, s4, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 11 +; SI-NEXT: s_mov_b32 s6, s95 +; SI-NEXT: s_mov_b32 s95, s39 +; SI-NEXT: s_mov_b32 s39, s89 +; SI-NEXT: s_mov_b32 s89, s99 +; SI-NEXT: s_mov_b32 s99, s83 +; SI-NEXT: s_mov_b32 s83, s55 +; SI-NEXT: s_mov_b32 s55, s64 +; SI-NEXT: s_mov_b32 s64, s35 +; SI-NEXT: s_mov_b32 s35, s77 ; SI-NEXT: s_or_b32 s77, s15, s11 -; SI-NEXT: s_and_b32 s11, s82, 0xff -; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 10 +; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 9 ; SI-NEXT: s_or_b32 s61, s11, s15 -; SI-NEXT: s_and_b32 s11, s31, 0xff +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 8 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s50, 24 -; SI-NEXT: s_mov_b32 s4, s85 -; SI-NEXT: s_mov_b32 s85, s83 -; SI-NEXT: s_mov_b32 s83, s82 -; SI-NEXT: s_mov_b32 s82, s53 -; SI-NEXT: s_mov_b32 s53, s50 -; SI-NEXT: s_mov_b32 s50, s31 -; SI-NEXT: s_mov_b32 s31, s78 +; SI-NEXT: s_lshl_b32 s15, s4, 24 ; SI-NEXT: s_or_b32 s78, s15, s11 -; SI-NEXT: v_readlane_b32 s11, v62, 10 +; SI-NEXT: v_readlane_b32 s11, v62, 7 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s15, s17, 8 ; SI-NEXT: s_or_b32 s11, s11, s15 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: v_mov_b32_e32 v51, s9 -; SI-NEXT: s_or_b32 s6, s11, s9 -; SI-NEXT: v_readlane_b32 s9, v62, 3 -; SI-NEXT: v_readlane_b32 s11, v62, 2 +; SI-NEXT: s_or_b32 s17, s11, s9 +; SI-NEXT: v_readlane_b32 s9, v62, 2 +; SI-NEXT: v_readlane_b32 s11, v62, 1 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s15, s11, 8 ; SI-NEXT: s_or_b32 s9, s9, s15 ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_or_b32 s17, s9, s14 -; SI-NEXT: v_readlane_b32 s9, v62, 22 +; SI-NEXT: s_mov_b32 s4, s96 +; SI-NEXT: s_mov_b32 s96, s24 ; SI-NEXT: v_mov_b32_e32 v52, s14 -; SI-NEXT: s_and_b32 s14, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 21 -; SI-NEXT: s_lshl_b32 s15, s9, 8 +; SI-NEXT: s_or_b32 s24, s9, s14 +; SI-NEXT: s_and_b32 s14, s93, 0xff +; SI-NEXT: s_lshl_b32 s15, s84, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v53, v6, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v50, s14, v53 -; SI-NEXT: s_and_b32 s14, s30, 0xff -; SI-NEXT: s_lshl_b32 s15, s88, 8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: s_and_b32 s14, s8, 0xff +; SI-NEXT: s_lshl_b32 s15, s49, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v54, v14, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v17, s14, v54 -; SI-NEXT: s_and_b32 s14, s8, 0xff -; SI-NEXT: s_lshl_b32 s15, s52, 8 +; SI-NEXT: s_and_b32 s14, s40, 0xff +; SI-NEXT: s_lshl_b32 s15, s53, 8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v55, v40, v1 +; SI-NEXT: v_or_b32_e32 v55, v18, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v16, s14, v55 -; SI-NEXT: s_and_b32 s14, s35, 0xff -; SI-NEXT: s_lshl_b32 s15, s28, 8 +; SI-NEXT: s_and_b32 s14, s34, 0xff +; SI-NEXT: s_lshl_b32 s15, s23, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v40, v42, v1 +; SI-NEXT: v_or_b32_e32 v40, v19, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v15, s14, v40 -; SI-NEXT: s_and_b32 s14, s95, 0xff +; SI-NEXT: s_and_b32 s14, s91, 0xff ; SI-NEXT: s_lshl_b32 s15, s66, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v41, v61, v1 +; SI-NEXT: v_or_b32_e32 v41, v22, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v12, s14, v41 -; SI-NEXT: s_and_b32 s14, s18, 0xff -; SI-NEXT: s_lshl_b32 s15, s91, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 +; SI-NEXT: s_and_b32 s14, s50, 0xff +; SI-NEXT: s_lshl_b32 s15, s54, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v42, v60, v1 +; SI-NEXT: v_or_b32_e32 v42, v23, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v11, s14, v42 -; SI-NEXT: s_and_b32 s14, s37, 0xff -; SI-NEXT: s_lshl_b32 s15, s19, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_and_b32 s14, s73, 0xff +; SI-NEXT: s_lshl_b32 s15, s36, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_mov_b32 s91, s6 -; SI-NEXT: v_or_b32_e32 v59, v31, v1 +; SI-NEXT: v_or_b32_e32 v28, v59, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_readlane_b32 s6, v62, 50 -; SI-NEXT: v_or_b32_e32 v10, s14, v59 -; SI-NEXT: s_and_b32 s14, s6, 0xff -; SI-NEXT: s_lshl_b32 s15, s70, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v10, s14, v28 +; SI-NEXT: s_and_b32 s14, s82, 0xff +; SI-NEXT: s_lshl_b32 s15, s52, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v5, v24, v1 +; SI-NEXT: v_or_b32_e32 v60, v24, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_readlane_b32 s6, v62, 19 -; SI-NEXT: v_or_b32_e32 v9, s14, v5 +; SI-NEXT: v_or_b32_e32 v9, s14, v60 ; SI-NEXT: s_and_b32 s14, s90, 0xff -; SI-NEXT: s_lshl_b32 s15, s6, 8 +; SI-NEXT: s_lshl_b32 s15, s16, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v13, v25, v1 +; SI-NEXT: v_or_b32_e32 v31, v44, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v8, s14, v13 -; SI-NEXT: s_and_b32 s14, s27, 0xff -; SI-NEXT: s_lshl_b32 s15, s24, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v8, s14, v31 +; SI-NEXT: s_and_b32 s14, s62, 0xff +; SI-NEXT: s_lshl_b32 s15, s65, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v26, v31 -; SI-NEXT: v_or_b32_e32 v31, v27, v1 +; SI-NEXT: v_or_b32_e32 v61, v45, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v7, s14, v31 -; SI-NEXT: s_and_b32 s14, s49, 0xff -; SI-NEXT: s_lshl_b32 s15, s68, 8 +; SI-NEXT: v_or_b32_e32 v7, s14, v61 +; SI-NEXT: s_and_b32 s14, s98, 0xff +; SI-NEXT: s_lshl_b32 s15, s67, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v60 -; SI-NEXT: v_or_b32_e32 v60, v43, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v6, v47, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_readlane_b32 s6, v62, 17 -; SI-NEXT: v_or_b32_e32 v4, s14, v60 -; SI-NEXT: s_and_b32 s14, s6, 0xff -; SI-NEXT: v_readlane_b32 s6, v62, 16 -; SI-NEXT: s_lshl_b32 s15, s6, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v4, s14, v6 +; SI-NEXT: s_and_b32 s14, s92, 0xff +; SI-NEXT: s_lshl_b32 s15, s7, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v23, v27 -; SI-NEXT: v_mov_b32_e32 v27, v24 -; SI-NEXT: v_mov_b32_e32 v24, v61 -; SI-NEXT: v_or_b32_e32 v61, v44, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 55 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v22, v14 +; SI-NEXT: v_or_b32_e32 v14, v56, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v2, s14, v61 -; SI-NEXT: s_and_b32 s14, s71, 0xff -; SI-NEXT: s_lshl_b32 s15, s36, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: v_or_b32_e32 v2, s14, v14 +; SI-NEXT: s_and_b32 s14, s70, 0xff +; SI-NEXT: s_lshl_b32 s15, s94, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: s_or_b32 s42, s8, s63 +; SI-NEXT: v_readlane_b32 s8, v62, 56 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_readlane_b32 s6, v62, 48 -; SI-NEXT: v_or_b32_e32 v6, v45, v1 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: v_mov_b32_e32 v23, v18 +; SI-NEXT: v_or_b32_e32 v18, v57, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_and_b32 s8, s6, 0xffff -; SI-NEXT: v_readlane_b32 s6, v62, 49 -; SI-NEXT: v_or_b32_e32 v1, s14, v6 -; SI-NEXT: s_and_b32 s14, s79, 0xff -; SI-NEXT: s_lshl_b32 s15, s7, 8 +; SI-NEXT: s_or_b32 s40, s8, s5 +; SI-NEXT: v_readlane_b32 s8, v62, 57 +; SI-NEXT: v_or_b32_e32 v1, s14, v18 +; SI-NEXT: s_and_b32 s14, s88, 0xff +; SI-NEXT: s_lshl_b32 s15, s79, 8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 -; SI-NEXT: s_or_b32 s42, s8, s63 -; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: v_readlane_b32 s9, v62, 60 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s40, s8, s21 -; SI-NEXT: s_and_b32 s8, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v14, v46, v3 -; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s15, s8, s41 -; SI-NEXT: s_and_b32 s8, s13, 0xffff -; SI-NEXT: v_or_b32_e32 v3, s14, v14 -; SI-NEXT: s_or_b32 s14, s8, s43 -; SI-NEXT: s_and_b32 s8, s16, 0xffff -; SI-NEXT: s_and_b32 s16, s73, 0xffff -; SI-NEXT: s_or_b32 s13, s8, s44 -; SI-NEXT: s_and_b32 s8, s62, 0xffff -; SI-NEXT: s_or_b32 s35, s16, s56 +; SI-NEXT: v_readlane_b32 s8, v62, 58 +; SI-NEXT: s_and_b32 s16, s9, 0xffff +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: v_mov_b32_e32 v26, v24 +; SI-NEXT: v_mov_b32_e32 v24, v19 +; SI-NEXT: v_or_b32_e32 v19, v58, v3 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s36, s16, s56 ; SI-NEXT: s_and_b32 s16, s48, 0xffff -; SI-NEXT: s_or_b32 s12, s8, s45 -; SI-NEXT: s_and_b32 s8, s10, 0xffff -; SI-NEXT: s_or_b32 s52, s16, vcc_lo +; SI-NEXT: v_or_b32_e32 v3, s14, v19 +; SI-NEXT: s_or_b32 s14, s8, s43 +; SI-NEXT: s_and_b32 s8, s13, 0xffff +; SI-NEXT: s_or_b32 s53, s16, vcc_lo ; SI-NEXT: s_and_b32 s16, s72, 0xffff -; SI-NEXT: s_or_b32 s10, s8, s46 -; SI-NEXT: s_and_b32 s8, s94, 0xffff +; SI-NEXT: s_or_b32 s13, s8, s44 +; SI-NEXT: s_and_b32 s8, s12, 0xffff ; SI-NEXT: s_or_b32 s94, s16, vcc_hi ; SI-NEXT: s_and_b32 s16, s57, 0xffff +; SI-NEXT: s_or_b32 s12, s8, s45 +; SI-NEXT: s_and_b32 s8, s10, 0xffff ; SI-NEXT: s_or_b32 s49, s16, s74 ; SI-NEXT: s_and_b32 s16, s58, 0xffff +; SI-NEXT: s_or_b32 s10, s8, s46 +; SI-NEXT: v_readlane_b32 s8, v62, 59 ; SI-NEXT: s_or_b32 s48, s16, s75 ; SI-NEXT: s_and_b32 s16, s59, 0xffff -; SI-NEXT: s_mov_b32 s25, s23 +; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s11, s16, s76 ; SI-NEXT: s_and_b32 s16, s60, 0xffff ; SI-NEXT: s_and_b32 s23, s61, 0xffff -; SI-NEXT: s_mov_b32 s87, s34 -; SI-NEXT: s_mov_b32 s34, s55 -; SI-NEXT: s_mov_b32 s55, s22 +; SI-NEXT: s_mov_b32 s30, s87 +; SI-NEXT: s_mov_b32 s87, s85 ; SI-NEXT: s_or_b32 s8, s8, s47 ; SI-NEXT: s_or_b32 s9, s16, s77 ; SI-NEXT: s_or_b32 s16, s23, s78 -; SI-NEXT: s_mov_b32 s22, s18 -; SI-NEXT: v_mov_b32_e32 v49, v30 ; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_mov_b32_e32 v18, v43 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v30, v37 ; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v22, v46 +; SI-NEXT: v_mov_b32_e32 v20, v47 +; SI-NEXT: v_mov_b32_e32 v49, v56 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v57 +; SI-NEXT: v_mov_b32_e32 v25, v58 ; SI-NEXT: v_alignbit_b32 v57, s42, v51, 16 ; SI-NEXT: v_alignbit_b32 v58, s40, v52, 16 ; SI-NEXT: v_alignbit_b32 v56, s15, v53, 16 ; SI-NEXT: v_alignbit_b32 v47, s14, v54, 16 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_alignbit_b32 v46, s13, v55, 16 ; SI-NEXT: v_alignbit_b32 v45, s12, v40, 16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v44, s10, v41, 16 ; SI-NEXT: v_alignbit_b32 v43, s8, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, s35, v59, 16 -; SI-NEXT: v_alignbit_b32 v41, s52, v5, 16 -; SI-NEXT: v_alignbit_b32 v40, s94, v13, 16 -; SI-NEXT: v_alignbit_b32 v55, s49, v31, 16 -; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_alignbit_b32 v54, s48, v60, 16 -; SI-NEXT: v_mov_b32_e32 v60, v25 -; SI-NEXT: v_mov_b32_e32 v25, v28 -; SI-NEXT: v_alignbit_b32 v53, s11, v61, 16 -; SI-NEXT: v_mov_b32_e32 v61, v24 -; SI-NEXT: v_mov_b32_e32 v24, v27 -; SI-NEXT: v_alignbit_b32 v52, s9, v6, 16 -; SI-NEXT: v_alignbit_b32 v51, s16, v14, 16 +; SI-NEXT: v_alignbit_b32 v42, s36, v28, 16 +; SI-NEXT: v_alignbit_b32 v41, s53, v60, 16 +; SI-NEXT: v_alignbit_b32 v40, s94, v31, 16 +; SI-NEXT: v_alignbit_b32 v55, s49, v61, 16 +; SI-NEXT: v_alignbit_b32 v54, s48, v6, 16 +; SI-NEXT: v_alignbit_b32 v53, s11, v14, 16 +; SI-NEXT: v_mov_b32_e32 v14, v22 +; SI-NEXT: v_alignbit_b32 v52, s9, v18, 16 +; SI-NEXT: v_mov_b32_e32 v18, v23 +; SI-NEXT: v_alignbit_b32 v51, s16, v19, 16 +; SI-NEXT: v_mov_b32_e32 v19, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 ; SI-NEXT: s_lshr_b32 s73, s63, 16 -; SI-NEXT: s_lshr_b32 s72, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s63, s41, 16 ; SI-NEXT: s_lshr_b32 s62, s43, 16 ; SI-NEXT: s_lshr_b32 s61, s44, 16 @@ -209550,573 +209672,477 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s56, vcc_lo, 16 ; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16 ; SI-NEXT: s_lshr_b32 s46, s74, 16 -; SI-NEXT: s_mov_b32 s74, s51 -; SI-NEXT: s_mov_b32 s51, s67 -; SI-NEXT: v_readlane_b32 s67, v62, 51 +; SI-NEXT: v_readlane_b32 s25, v62, 62 +; SI-NEXT: v_readlane_b32 s74, v62, 61 ; SI-NEXT: s_lshr_b32 s45, s75, 16 -; SI-NEXT: s_mov_b32 s23, s25 -; SI-NEXT: s_mov_b32 s21, s55 -; SI-NEXT: s_mov_b32 s55, s34 -; SI-NEXT: s_mov_b32 s75, s54 -; SI-NEXT: s_mov_b32 s34, s87 -; SI-NEXT: v_readlane_b32 s25, v62, 53 -; SI-NEXT: v_readlane_b32 s87, v62, 52 ; SI-NEXT: s_lshr_b32 s44, s76, 16 -; SI-NEXT: v_readlane_b32 s54, v62, 50 +; SI-NEXT: s_mov_b32 s76, s37 +; SI-NEXT: s_mov_b32 s37, s97 +; SI-NEXT: s_mov_b32 s97, s21 +; SI-NEXT: s_mov_b32 s21, s18 +; SI-NEXT: s_mov_b32 s18, s17 +; SI-NEXT: s_mov_b32 s85, s87 +; SI-NEXT: s_mov_b32 s87, s30 +; SI-NEXT: s_mov_b32 s17, s24 ; SI-NEXT: s_lshr_b32 s43, s77, 16 -; SI-NEXT: s_mov_b32 s76, s69 -; SI-NEXT: s_mov_b32 s69, s95 -; SI-NEXT: s_mov_b32 s77, s84 -; SI-NEXT: s_mov_b32 s84, s99 -; SI-NEXT: s_mov_b32 s99, s93 +; SI-NEXT: s_mov_b32 s77, s35 +; SI-NEXT: s_mov_b32 s35, s64 +; SI-NEXT: s_mov_b32 s64, s55 +; SI-NEXT: s_mov_b32 s55, s83 +; SI-NEXT: s_mov_b32 s83, s99 +; SI-NEXT: s_mov_b32 s99, s89 +; SI-NEXT: s_mov_b32 s89, s39 +; SI-NEXT: s_mov_b32 s39, s95 +; SI-NEXT: s_mov_b32 s95, s6 ; SI-NEXT: s_lshr_b32 s41, s78, 16 -; SI-NEXT: s_mov_b32 s78, s31 -; SI-NEXT: s_mov_b32 s31, s50 -; SI-NEXT: s_mov_b32 s50, s53 -; SI-NEXT: s_mov_b32 s53, s82 -; SI-NEXT: s_mov_b32 s82, s83 -; SI-NEXT: s_mov_b32 s83, s85 -; SI-NEXT: s_mov_b32 s85, s4 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v6, v20 -; SI-NEXT: v_mov_b32_e32 v13, v19 -; SI-NEXT: v_mov_b32_e32 v14, v29 -; SI-NEXT: s_branch .LBB97_3 -; SI-NEXT: .LBB97_2: -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v22, v46 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v44 -; SI-NEXT: v_mov_b32_e32 v18, v43 -; SI-NEXT: v_mov_b32_e32 v23, v27 -; SI-NEXT: v_mov_b32_e32 v21, v29 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: .LBB97_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v5, v39 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB97_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s79, 3 +; SI-NEXT: s_mov_b32 s24, s96 +; SI-NEXT: s_mov_b32 s96, s4 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v5, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_add_i32 s4, s88, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_lshl_b32 s5, s79, 8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_add_i32 s4, s82, 3 +; SI-NEXT: v_readlane_b32 s4, v62, 11 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 10 +; SI-NEXT: v_readlane_b32 s6, v62, 9 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s53, 8 -; SI-NEXT: s_add_i32 s8, s31, 3 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s8, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 8 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s50, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 30 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s71, 0xff -; SI-NEXT: s_lshl_b32 s8, s36, 8 +; SI-NEXT: s_and_b32 s5, s70, 0xff +; SI-NEXT: s_lshl_b32 s8, s6, 8 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 ; SI-NEXT: s_or_b32 s5, s8, s5 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_readlane_b32 s7, v62, 12 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: s_add_i32 s36, s86, 3 -; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v62, 11 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 ; SI-NEXT: v_or_b32_e32 v2, s5, v2 -; SI-NEXT: s_and_b32 s5, s36, 0xff -; SI-NEXT: s_add_i32 s9, s7, 3 +; SI-NEXT: s_add_i32 s5, s35, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 13 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s8, s6, 8 +; SI-NEXT: s_add_i32 s9, s77, 3 ; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: v_readlane_b32 s6, v62, 12 ; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s8, s78, 24 +; SI-NEXT: s_lshl_b32 s8, s6, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_add_i32 s16, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: s_add_i32 s9, s5, 0x3000000 -; SI-NEXT: s_add_i32 s79, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 16 +; SI-NEXT: s_add_i32 s79, s92, 3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1 +; SI-NEXT: s_add_i32 s16, s4, 0x3000000 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 +; SI-NEXT: s_add_i32 s9, s5, 0x3000000 ; SI-NEXT: s_and_b32 s4, s79, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_add_i32 s4, s77, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 15 -; SI-NEXT: v_readlane_b32 s6, v62, 14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s4, s24, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_lshl_b32 s5, s55, 8 +; SI-NEXT: s_add_i32 s8, s37, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 13 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s51, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s52, s98, 3 ; SI-NEXT: s_add_i32 s11, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 32 -; SI-NEXT: s_add_i32 s53, s4, 3 -; SI-NEXT: s_and_b32 s4, s53, 0xff -; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: s_and_b32 s4, s52, 0xff +; SI-NEXT: s_lshl_b32 s5, s67, 8 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v18, v4 -; SI-NEXT: s_add_i32 s93, s98, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 18 +; SI-NEXT: v_or_b32_e32 v4, v20, v4 +; SI-NEXT: s_add_i32 s30, s87, 3 ; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s93, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_and_b32 s4, s30, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s8, s68, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s81, 24 +; SI-NEXT: s_lshl_b32 s5, s28, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s86, s27, 3 ; SI-NEXT: s_add_i32 s48, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s86, 0xff -; SI-NEXT: s_lshl_b32 s5, s24, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; SI-NEXT: v_readlane_b32 s4, v62, 42 +; SI-NEXT: v_mov_b32_e32 v22, v30 +; SI-NEXT: s_add_i32 s87, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 39 +; SI-NEXT: s_and_b32 s4, s87, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v23, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 38 -; SI-NEXT: s_add_i32 s68, s4, 3 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s97, 8 -; SI-NEXT: s_add_i32 s8, s67, 3 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: v_readlane_b32 s4, v62, 32 +; SI-NEXT: s_add_i32 s67, s4, 3 +; SI-NEXT: s_and_b32 s4, s67, 0xff +; SI-NEXT: s_lshl_b32 s5, s85, 8 +; SI-NEXT: s_add_i32 s8, s69, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s51, 24 +; SI-NEXT: s_lshl_b32 s5, s74, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s50, s90, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 19 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v5 +; SI-NEXT: v_readlane_b32 s5, v62, 23 ; SI-NEXT: s_add_i32 s49, s4, 0x3000000 ; SI-NEXT: s_and_b32 s4, s50, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v25, v5 -; SI-NEXT: s_add_i32 s94, s20, 3 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_add_i32 s94, s86, 3 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 ; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s85, 8 -; SI-NEXT: s_add_i32 s8, s83, 3 +; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_add_i32 s8, s71, 3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s84, 24 +; SI-NEXT: s_lshl_b32 s5, s81, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s20, s54, 3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v5 ; SI-NEXT: s_add_i32 s94, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v49 +; SI-NEXT: v_readlane_b32 s4, v62, 52 +; SI-NEXT: s_add_i32 s18, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 51 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v24, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s98, s38, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 20 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_readlane_b32 s4, v62, 45 +; SI-NEXT: s_add_i32 s98, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 43 +; SI-NEXT: v_readlane_b32 s6, v62, 14 ; SI-NEXT: s_and_b32 s4, s98, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s8, s92, 3 +; SI-NEXT: s_add_i32 s8, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s5, s31, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s17, s37, 3 -; SI-NEXT: s_add_i32 s52, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_add_i32 s53, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 41 +; SI-NEXT: s_add_i32 s86, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 38 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 34 -; SI-NEXT: v_readlane_b32 s6, v62, 29 +; SI-NEXT: v_or_b32_e32 v10, v59, v10 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: v_readlane_b32 s4, v62, 31 +; SI-NEXT: s_add_i32 s66, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 27 +; SI-NEXT: s_and_b32 s4, s66, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s38, s6, 3 -; SI-NEXT: s_and_b32 s8, s38, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s85, s25, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 40 -; SI-NEXT: s_add_i32 s70, s6, 3 -; SI-NEXT: s_and_b32 s7, s70, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_add_i32 s19, s69, 3 -; SI-NEXT: s_add_i32 s51, s30, 3 -; SI-NEXT: s_add_i32 s95, s89, 3 -; SI-NEXT: v_mov_b32_e32 v30, s16 -; SI-NEXT: v_mov_b32_e32 v39, s9 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; SI-NEXT: v_mov_b32_e32 v28, s11 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 -; SI-NEXT: v_mov_b32_e32 v27, s48 -; SI-NEXT: v_mov_b32_e32 v26, s49 -; SI-NEXT: v_mov_b32_e32 v25, s94 -; SI-NEXT: v_mov_b32_e32 v24, s52 -; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 -; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 -; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 -; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 -; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 -; SI-NEXT: s_lshr_b32 s56, s52, 16 -; SI-NEXT: s_lshr_b32 s47, s94, 16 -; SI-NEXT: s_lshr_b32 s46, s49, 16 -; SI-NEXT: s_lshr_b32 s45, s48, 16 -; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: s_lshr_b32 s43, s9, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s4, v62, 37 -; SI-NEXT: s_add_i32 s67, s4, 3 -; SI-NEXT: s_and_b32 s4, s67, 0xff +; SI-NEXT: s_add_i32 s37, s39, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 30 +; SI-NEXT: s_and_b32 s8, s37, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s95, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 46 -; SI-NEXT: s_add_i32 s35, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_add_i32 s36, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 50 +; SI-NEXT: s_add_i32 s21, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 49 +; SI-NEXT: s_and_b32 s4, s21, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 8 -; SI-NEXT: v_mov_b32_e32 v23, s35 -; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 -; SI-NEXT: s_lshr_b32 s57, s35, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s5, v62, 37 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s71, s22, 3 +; SI-NEXT: s_and_b32 s8, s71, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s35, s99, 3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: v_readlane_b32 s4, v62, 40 +; SI-NEXT: s_add_i32 s85, s4, 3 ; SI-NEXT: s_and_b32 s4, s85, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 36 +; SI-NEXT: v_readlane_b32 s5, v62, 33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 47 ; SI-NEXT: s_add_i32 s8, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 54 +; SI-NEXT: s_add_i32 s17, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 53 +; SI-NEXT: s_and_b32 s4, s17, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 44 +; SI-NEXT: v_readlane_b32 s5, v62, 47 ; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_and_b32 s6, s35, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v39, s9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_mov_b32_e32 v28, s11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 +; SI-NEXT: v_mov_b32_e32 v27, s48 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: v_mov_b32_e32 v26, s49 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: v_mov_b32_e32 v25, s94 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: v_mov_b32_e32 v24, s53 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 +; SI-NEXT: v_mov_b32_e32 v23, s36 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11 ; SI-NEXT: v_mov_b32_e32 v22, s8 ; SI-NEXT: v_alignbit_b32 v43, v22, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 +; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 +; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 ; SI-NEXT: s_lshr_b32 s58, s8, 16 +; SI-NEXT: s_lshr_b32 s57, s36, 16 +; SI-NEXT: s_lshr_b32 s56, s53, 16 +; SI-NEXT: s_lshr_b32 s47, s94, 16 +; SI-NEXT: s_lshr_b32 s46, s49, 16 +; SI-NEXT: s_lshr_b32 s45, s48, 16 +; SI-NEXT: s_lshr_b32 s44, s11, 16 +; SI-NEXT: s_lshr_b32 s43, s9, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s4, v62, 45 -; SI-NEXT: s_add_i32 s6, s4, 3 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: v_readlane_b32 s6, v62, 27 -; SI-NEXT: s_add_i32 s34, s6, 3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 48 +; SI-NEXT: s_add_i32 s7, s4, 3 +; SI-NEXT: s_and_b32 s4, s7, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 28 -; SI-NEXT: s_and_b32 s6, s34, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s5, s89, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s10, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 43 -; SI-NEXT: s_add_i32 s97, s4, 3 -; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 46 +; SI-NEXT: s_add_i32 s99, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 44 +; SI-NEXT: s_and_b32 s4, s99, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 41 -; SI-NEXT: v_readlane_b32 s6, v62, 33 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s54, s6, 3 -; SI-NEXT: s_and_b32 s6, s54, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_mov_b32_e32 v20, s10 -; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 -; SI-NEXT: s_lshr_b32 s59, s10, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v14, v5 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s4, v62, 42 +; SI-NEXT: v_readlane_b32 s4, v62, 36 ; SI-NEXT: s_add_i32 s81, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 35 +; SI-NEXT: v_readlane_b32 s6, v62, 28 ; SI-NEXT: s_and_b32 s4, s81, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s55, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 31 +; SI-NEXT: v_readlane_b32 s5, v62, 26 +; SI-NEXT: s_and_b32 s6, s55, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s12, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 39 +; SI-NEXT: v_readlane_b32 s4, v62, 34 ; SI-NEXT: s_add_i32 s69, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 35 +; SI-NEXT: v_readlane_b32 s5, v62, 29 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s69, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 25 -; SI-NEXT: v_readlane_b32 s6, v62, 24 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_readlane_b32 s4, v62, 22 +; SI-NEXT: s_add_i32 s34, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 21 +; SI-NEXT: v_readlane_b32 s6, v62, 19 +; SI-NEXT: s_and_b32 s4, s34, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s92, s6, 3 -; SI-NEXT: s_and_b32 s6, s92, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_mov_b32_e32 v19, s12 -; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16 -; SI-NEXT: s_lshr_b32 s60, s12, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v13, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 26 -; SI-NEXT: s_add_i32 s31, s4, 3 -; SI-NEXT: s_and_b32 s4, s31, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s92, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s87, 24 +; SI-NEXT: s_lshl_b32 s5, s97, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s13, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 25 +; SI-NEXT: s_add_i32 s51, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 24 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s88, 8 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 8 -; SI-NEXT: s_add_i32 s6, s96, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v18, s13 -; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16 -; SI-NEXT: s_lshr_b32 s61, s13, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v14, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_readlane_b32 s4, v62, 20 +; SI-NEXT: s_add_i32 s95, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 18 ; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s96, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s76, 24 +; SI-NEXT: s_lshl_b32 s5, s80, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s14, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 22 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 21 +; SI-NEXT: s_add_i32 s4, s93, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s65, 8 -; SI-NEXT: s_add_i32 s6, s55, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5 -; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16 -; SI-NEXT: s_lshr_b32 s62, s14, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_add_i32 s4, s80, 3 +; SI-NEXT: s_add_i32 s4, s83, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s6, s64, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s64, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s15, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: v_readlane_b32 s4, v62, 2 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 2 +; SI-NEXT: v_readlane_b32 s5, v62, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s26, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 1 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s27, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -210126,42 +210152,41 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s4, s4, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s6, s74, 3 +; SI-NEXT: s_add_i32 s6, s76, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s21, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s40, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 10 +; SI-NEXT: v_readlane_b32 s4, v62, 7 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 23 -; SI-NEXT: v_readlane_b32 s6, v62, 9 +; SI-NEXT: v_readlane_b32 s5, v62, 17 +; SI-NEXT: v_readlane_b32 s6, v62, 6 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 8 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s19, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s91, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 7 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 6 -; SI-NEXT: v_readlane_b32 s6, v62, 5 +; SI-NEXT: s_add_i32 s18, s4, 0x3000000 +; SI-NEXT: s_add_i32 s4, s20, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 5 +; SI-NEXT: v_readlane_b32 s6, v62, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: v_readlane_b32 s5, v62, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 @@ -210170,18 +210195,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s42, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v13, s91 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v19, s12 +; SI-NEXT: v_mov_b32_e32 v18, s13 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5 +; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v6 ; SI-NEXT: v_mov_b32_e32 v6, s15 ; SI-NEXT: v_alignbit_b32 v57, s42, v13, 16 ; SI-NEXT: v_mov_b32_e32 v13, s17 ; SI-NEXT: v_alignbit_b32 v58, s40, v13, 16 ; SI-NEXT: v_alignbit_b32 v56, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 ; SI-NEXT: s_lshr_b32 s73, s42, 16 ; SI-NEXT: s_lshr_b32 s72, s40, 16 ; SI-NEXT: s_lshr_b32 s63, s15, 16 -; SI-NEXT: .LBB97_5: ; %end -; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: s_lshr_b32 s62, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s13, 16 +; SI-NEXT: s_lshr_b32 s60, s12, 16 +; SI-NEXT: s_lshr_b32 s59, s10, 16 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: s_and_b32 s4, s18, 0xffff ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: s_and_b32 s4, s42, 0xffff @@ -210190,7 +210230,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s17, 0xffff @@ -210287,7 +210326,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 -; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_and_b32 s4, s36, 0xffff ; SI-NEXT: s_lshl_b32 s5, s57, 16 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 @@ -210300,7 +210339,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 -; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_and_b32 s4, s53, 0xffff ; SI-NEXT: s_lshl_b32 s5, s56, 16 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 @@ -210436,11 +210475,91 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: v_mov_b32_e32 v5, v13 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v58 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v57 +; SI-NEXT: v_mov_b32_e32 v49, v56 +; SI-NEXT: v_mov_b32_e32 v20, v47 +; SI-NEXT: v_mov_b32_e32 v30, v37 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v45 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v128i8_to_v64i16_scalar: ; VI: ; %bb.0: @@ -210831,17 +210950,17 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v45, v62 ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v32, v1 ; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 ; VI-NEXT: v_mov_b32_e32 v41, v24 ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload @@ -212078,8 +212197,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_cbranch_vccnz .LBB97_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -212112,7 +212231,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -212162,9 +212281,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_and_b32 s9, s18, 0xff ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -212197,11 +212316,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -212212,11 +212331,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 @@ -212347,9 +212466,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -212426,22 +212545,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 ; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 @@ -214272,13 +214390,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 @@ -214303,112 +214423,94 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -214522,6 +214624,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -214538,6 +214642,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -214568,10 +214679,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -214596,12 +214703,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -214613,37 +214718,54 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 @@ -214652,8 +214774,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v47 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v47 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -214664,7 +214786,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v45, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -214675,7 +214797,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v43, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -214686,7 +214808,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v41, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -214697,7 +214819,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v55, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -214708,7 +214830,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v53, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -214719,103 +214841,102 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v51, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v50, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v49, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v49, v2, v27 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v48, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v39, v2, v23 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_or_b32_e32 v39, v2, v19 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v38, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v37, v2, v15 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_or_b32_e32 v37, v2, v29 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v36, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v35, v2, v29 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: v_or_b32_e32 v35, v2, v25 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v34, v2, v6 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v33, v2, v21 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_or_b32_e32 v33, v2, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v31, v2, v19 +; SI-NEXT: v_or_b32_e32 v31, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v32, v2, v17 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: v_or_b32_e32 v32, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v26, v2, v25 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v26, v2, v15 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v30, v2, v13 +; SI-NEXT: v_or_b32_e32 v30, v2, v3 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v18, v2, v11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_or_b32_e32 v18, v2, v21 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v22, v2, v9 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_or_b32_e32 v22, v2, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v14, v2, v7 +; SI-NEXT: v_or_b32_e32 v14, v2, v17 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v10, v2, v5 +; SI-NEXT: v_or_b32_e32 v10, v2, v9 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v6, v2, v3 +; SI-NEXT: v_or_b32_e32 v6, v2, v13 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v1 @@ -214843,17 +214964,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill @@ -214879,16 +215000,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v1, v60, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v58, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v59, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v59, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v63, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v63, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v58, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 @@ -214898,22 +215019,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v61, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v62, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v45, v46, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill @@ -215030,31 +215151,31 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v22, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v22, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v10, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v10, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v2, v6, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v2, v6, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v2, v6, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill @@ -215206,59 +215327,66 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB98_2: ; %Flow ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v2, v13, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v9, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 ; SI-NEXT: v_alignbit_b32 v12, v2, v6, 24 ; SI-NEXT: v_alignbit_b32 v20, v2, v6, 16 ; SI-NEXT: v_alignbit_b32 v47, v2, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v62, v22, v18, 24 ; SI-NEXT: v_alignbit_b32 v63, v22, v18, 16 ; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v22 @@ -215266,71 +215394,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 ; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 ; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 ; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -215338,12 +215464,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -215352,32 +215478,27 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 ; SI-NEXT: v_alignbit_b32 v56, v10, v14, 24 ; SI-NEXT: v_alignbit_b32 v57, v10, v14, 16 ; SI-NEXT: v_alignbit_b32 v61, v10, v14, 8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215385,7 +215506,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215393,7 +215514,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215401,7 +215522,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215409,7 +215530,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215417,7 +215538,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215425,7 +215546,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215433,7 +215554,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215441,7 +215562,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215449,7 +215570,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -215457,7 +215578,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -216239,7 +216360,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -217510,8 +217631,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 @@ -217636,7 +217757,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -217755,7 +217876,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 @@ -217893,7 +218013,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] @@ -218309,13 +218428,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 @@ -218334,13 +218454,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 @@ -218359,13 +218480,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 @@ -218384,13 +218506,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 @@ -218409,13 +218532,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 @@ -219761,14 +219885,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: v_writelane_b32 v63, s30, 0 @@ -219850,14 +219974,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v43 ; SI-NEXT: v_writelane_b32 v62, s6, 0 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill @@ -225746,12 +225870,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -225788,8 +225906,12 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 @@ -225803,11 +225925,11 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 ; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 @@ -225820,21 +225942,22 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -233259,27 +233382,23 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -233298,11 +233417,20 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 @@ -233317,35 +233445,36 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -233476,217 +233605,211 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 @@ -233754,18 +233877,11 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63 ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 @@ -233773,14 +233889,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v60 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v61 ; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v58 @@ -233789,16 +233897,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v57 ; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v56 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 @@ -233806,20 +233904,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v58, v42 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v44 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v45 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v58 @@ -233828,16 +233912,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 ; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v40 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v41 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v58 @@ -233846,20 +233920,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v58, v50 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v52 ; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v53 ; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v58 @@ -233868,16 +233928,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v58, v38 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v58 @@ -233886,20 +233936,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v36 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v58 @@ -233908,242 +233944,330 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v3, v24 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234151,11 +234275,9 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -234163,8 +234285,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234174,8 +234296,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234185,8 +234307,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234196,8 +234318,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234207,8 +234329,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234218,8 +234340,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234229,8 +234351,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234240,8 +234362,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234251,8 +234373,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234262,8 +234384,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234273,8 +234395,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234284,8 +234406,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234295,8 +234417,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234306,8 +234428,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234317,8 +234439,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234328,8 +234450,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234339,8 +234461,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234350,8 +234472,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234361,8 +234483,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234372,8 +234494,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234383,8 +234505,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234394,8 +234516,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234405,8 +234527,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234416,8 +234538,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234427,8 +234549,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234438,8 +234560,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234449,8 +234571,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234460,8 +234582,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234471,8 +234593,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -234482,8 +234604,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -236259,145 +236381,173 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v15 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v20 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v27 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v28 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v35 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v36 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v39 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v48 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v51 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v47 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v10, 1.0, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v61 ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v51 ; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v31 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v63 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v54 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v53 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v63 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -236410,720 +236560,672 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v20 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v22 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v25 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v34 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v56 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v40 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v32 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v54 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: .LBB104_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v33 ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v43 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v31, v35, v31, 16 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v32 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_alignbit_b32 v31, v35, v31, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v34 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v51 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_alignbit_b32 v15, v22, v15, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_alignbit_b32 v15, v19, v15, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_alignbit_b32 v13, v18, v13, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v10, v13, v10, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_alignbit_b32 v1, v8, v1, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; SI-NEXT: v_alignbit_b32 v4, v8, v4, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v4, v10, v4, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_alignbit_b32 v1, v10, v1, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v61, v1, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_alignbit_b32 v1, v15, v1, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v18, v12, v1, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v19, v10, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v16, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v4, v13, v4, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v10, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v21, v10, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v21, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v10, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v18, v16, v4, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 +; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_mov_b32_e32 v55, v18 -; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v10, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v24, v45, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v53, v54, v11, 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v35, v36, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v2, v35, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v53, v3, 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v37, v38, v11, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v11, v37, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v39, v48, v12, 16 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v35, v36, v10, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v49, v50, v13, 16 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51 ; SI-NEXT: v_alignbit_b32 v13, v49, v13, 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v51, v52, v14, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v31 -; SI-NEXT: v_mov_b32_e32 v31, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v14, v51, v14, 16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 ; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v24, v4, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v23, v7, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v22, v5, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v21, v6, 16 +; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v19, v9, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v31, v10, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237137,8 +237239,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237153,8 +237255,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237169,8 +237271,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237185,8 +237287,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237201,8 +237303,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237217,8 +237319,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237233,8 +237335,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237243,14 +237345,14 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237259,7 +237361,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -237267,8 +237369,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237277,8 +237379,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237287,8 +237389,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237297,8 +237399,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237307,8 +237409,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237317,8 +237419,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237327,8 +237429,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237337,8 +237439,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237347,8 +237449,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237357,8 +237459,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237367,8 +237469,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237376,17 +237478,17 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -237394,10 +237496,12 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -242750,15 +242854,15 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -242777,14 +242881,14 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 @@ -242857,17 +242961,14 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 @@ -242900,10 +243001,9 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -242951,111 +243051,118 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v46 -; SI-NEXT: v_add_i32_e32 v46, vcc, 0x30000, v3 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v9, v52, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v50, v3 -; SI-NEXT: v_or_b32_e32 v1, v51, v1 ; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v49, v3 ; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v48, v3 ; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v61 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v39, v3 ; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v38, v3 ; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v34, v3 ; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v3 @@ -243063,11 +243170,11 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v33, v3 ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v32, v3 ; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 @@ -243078,24 +243185,20 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v18, v11 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v54, v5 ; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v53, v7 ; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v9, v52, v9 ; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v53 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -243107,8 +243210,8 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v11, v16, v11 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -243158,6 +243261,22 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 @@ -243169,28 +243288,15 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v48 ; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 ; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v50 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v51 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v53 ; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v55 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 @@ -243269,7 +243375,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 @@ -245096,531 +245202,533 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v49 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v59 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v18, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v62 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v49 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v52 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v36 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v56 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v56, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v62, v23 -; SI-NEXT: v_mov_b32_e32 v63, v11 +; SI-NEXT: v_mov_b32_e32 v59, v29 +; SI-NEXT: v_mov_b32_e32 v29, v27 +; SI-NEXT: v_mov_b32_e32 v57, v23 +; SI-NEXT: v_mov_b32_e32 v60, v3 +; SI-NEXT: v_mov_b32_e32 v62, v4 +; SI-NEXT: v_mov_b32_e32 v63, v49 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v36, v36, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; SI-NEXT: v_or_b32_e32 v35, v35, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v36 ; SI-NEXT: v_or_b32_e32 v33, v33, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v58 -; SI-NEXT: v_or_b32_e32 v57, v37, v48 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_or_b32_e32 v41, v11, v37 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_or_b32_e32 v17, v17, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v21, v21, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_or_b32_e32 v25, v25, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v61, v3, v37 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 ; SI-NEXT: v_or_b32_e32 v31, v31, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_or_b32_e32 v18, v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 ; SI-NEXT: v_or_b32_e32 v2, v2, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v52, v37, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v55, v37, v48 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v55, v37, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v43 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v43, v37, v48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v62 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v43, v37, v39 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v45 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_or_b32_e32 v45, v42, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v59 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_alignbit_b32 v39, v13, v39, 16 -; SI-NEXT: v_or_b32_e32 v50, v50, v51 -; SI-NEXT: v_alignbit_b32 v51, v57, v51, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_or_b32_e32 v11, v37, v1 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v1, v43, v1, 16 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v38, v38, v47 +; SI-NEXT: v_or_b32_e32 v54, v54, v42 +; SI-NEXT: v_or_b32_e32 v45, v45, v50 +; SI-NEXT: v_or_b32_e32 v41, v41, v30 +; SI-NEXT: v_or_b32_e32 v46, v46, v32 +; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16 +; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 +; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v48, v4 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v3, v39, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v4, v55, v4, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_or_b32_e32 v11, v37, v8 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_alignbit_b32 v8, v5, v8, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 -; SI-NEXT: v_or_b32_e32 v11, v48, v10 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_alignbit_b32 v10, v2, v10, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 -; SI-NEXT: v_or_b32_e32 v11, v49, v37 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v3, v39, v9 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v49, v63 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_or_b32_e32 v63, v49, v48 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v47 -; SI-NEXT: v_or_b32_e32 v62, v49, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v60 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_alignbit_b32 v24, v31, v24, 16 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v60, v49, v28 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v46 -; SI-NEXT: v_or_b32_e32 v38, v38, v49 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_alignbit_b32 v46, v41, v52, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v62, v56, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v60 +; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_or_b32_e32 v60, v56, v39 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v57, v56, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v59 +; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v29, v29, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_or_b32_e32 v11, v53, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v61 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v59, v56, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v63, v56, v35 +; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v3 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v3, v49, v51 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v11, v16, v37, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_alignbit_b32 v3, v19, v39, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v11, v19, v48, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v3, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v37, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v11, v25, v23, 16 -; SI-NEXT: v_or_b32_e32 v61, v53, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 -; SI-NEXT: v_alignbit_b32 v30, v33, v30, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v54 -; SI-NEXT: v_or_b32_e32 v54, v53, v32 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v47 -; SI-NEXT: v_alignbit_b32 v47, v17, v49, 16 -; SI-NEXT: v_or_b32_e32 v59, v42, v53 -; SI-NEXT: v_alignbit_b32 v42, v21, v28, 16 -; SI-NEXT: v_alignbit_b32 v32, v35, v32, 16 -; SI-NEXT: v_alignbit_b32 v56, v36, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v21, v27, 16 +; SI-NEXT: v_alignbit_b32 v51, v61, v51, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: .LBB108_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v34, v37, v34 +; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v43 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v34, v34, v37 ; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -245633,14 +245741,14 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -245653,14 +245761,16 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -245677,82 +245787,84 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index d966d136d75b6..f0fa7e95c75ef 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -4568,14 +4568,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 @@ -4595,18 +4595,18 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v8 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v44 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 @@ -4862,13 +4862,13 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 @@ -4888,18 +4888,18 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v8 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v44 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB14_2 @@ -5074,13 +5074,13 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 @@ -5100,18 +5100,18 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v8 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v44 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_2 @@ -12053,14 +12053,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 @@ -12080,18 +12080,18 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v8 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v44 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 @@ -12347,13 +12347,13 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 @@ -12373,18 +12373,18 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v8 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v44 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB34_2 @@ -12559,13 +12559,13 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 @@ -12585,18 +12585,18 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v8 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v44 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_2 @@ -19423,13 +19423,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v38, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 @@ -19451,17 +19451,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19646,13 +19646,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v35, v4 ; GFX9-NEXT: v_mov_b32_e32 v33, v2 ; GFX9-NEXT: v_mov_b32_e32 v36, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:4 @@ -19674,17 +19674,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25685,14 +25685,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_mov_b32_e32 v31, v2 ; SI-NEXT: v_mov_b32_e32 v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 @@ -25700,7 +25700,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 ; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v11 ; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 ; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v17 @@ -25726,23 +25726,23 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v33 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25786,7 +25786,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v51 ; SI-NEXT: v_or_b32_e32 v6, v6, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v53 ; SI-NEXT: v_or_b32_e32 v6, v6, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 @@ -25794,13 +25794,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 ; SI-NEXT: v_or_b32_e32 v0, v0, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v38 ; SI-NEXT: v_or_b32_e32 v2, v2, v39 ; SI-NEXT: v_or_b32_e32 v3, v3, v48 ; SI-NEXT: v_or_b32_e32 v4, v4, v49 -; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 ; SI-NEXT: v_or_b32_e32 v6, v6, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -25827,14 +25827,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -25853,12 +25853,12 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 @@ -25915,7 +25915,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -25998,13 +25998,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v38, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 @@ -26026,17 +26026,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26221,13 +26221,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v35, v4 ; GFX9-NEXT: v_mov_b32_e32 v33, v2 ; GFX9-NEXT: v_mov_b32_e32 v36, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:4 @@ -26249,17 +26249,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -30050,13 +30050,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 @@ -30079,17 +30079,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -30355,13 +30355,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 @@ -30383,17 +30383,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -30578,13 +30578,13 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 @@ -30606,17 +30606,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -34694,13 +34694,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 @@ -34723,17 +34723,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -34999,13 +34999,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 @@ -35027,17 +35027,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -35222,13 +35222,13 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_mov_b32_e32 v33, v4 ; GFX9-NEXT: v_mov_b32_e32 v32, v2 ; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 @@ -35250,17 +35250,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 397955a8a8928..a14008d65743e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -11396,170 +11396,171 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v34, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v12 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v51 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v55 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v42 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v53, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -11567,18 +11568,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -11587,7 +11588,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -11595,30 +11595,30 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v22 @@ -11638,54 +11638,59 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v7, v7, v30 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v45 ; SI-NEXT: v_or_b32_e32 v9, v9, v59 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v41 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v51 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 -; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v13, v13, v19 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v23 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 @@ -11712,21 +11717,20 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: .LBB26_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 @@ -11734,55 +11738,54 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v53, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -11791,20 +11794,20 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -11820,35 +11823,34 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -11874,68 +11876,70 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_or_b32_e32 v9, v59, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v10, v46, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v11, v41, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v14, v25, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: .LBB26_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -11955,7 +11959,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v16i32: @@ -11993,38 +11997,31 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 ; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v3 @@ -12035,38 +12032,52 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v15 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v4 -; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v8 -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v12 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v14 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v24 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v30 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12077,28 +12088,28 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -12125,18 +12136,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -12166,23 +12177,23 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v52, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v61, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v41, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v19, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -12216,24 +12227,24 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: .LBB26_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB26_4 @@ -12244,27 +12255,27 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v9, 3, v51 -; VI-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v9, 3, v40 +; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u16_e32 v10, 3, v24 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v11, 3, v30 -; VI-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v23 +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 3, v60 -; VI-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v12, 3, v38 +; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v13, 3, v47 -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v58 +; VI-NEXT: v_or_b32_sdwa v13, v50, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v14, 3, v42 -; VI-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v45 +; VI-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v17, 3, v17 -; VI-NEXT: v_or_b32_sdwa v17, v57, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -12300,18 +12311,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_or_b32_sdwa v16, v23, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 @@ -12350,39 +12361,39 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v63, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v52 +; VI-NEXT: v_add_u16_e32 v8, 3, v43 ; VI-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v50 +; VI-NEXT: v_add_u16_e32 v9, 3, v52 ; VI-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 ; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v49 -; VI-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 3, v29 +; VI-NEXT: v_or_b32_sdwa v10, v44, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v63 -; VI-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v26 +; VI-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v48 -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 3, v61 +; VI-NEXT: v_or_b32_sdwa v12, v51, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v44 -; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_add_u16_e32 v14, 3, v41 ; VI-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_or_b32_e32 v14, v14, v16 ; VI-NEXT: v_add_u16_e32 v16, 3, v19 -; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: .LBB26_4: ; %end @@ -12441,39 +12452,32 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 ; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v3 @@ -12483,50 +12487,57 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v38 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v49 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v52 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12536,29 +12547,29 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -12585,18 +12596,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -12626,23 +12637,23 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v43, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v52, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v61, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v41, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -12676,24 +12687,24 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: .LBB26_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB26_4 @@ -12703,28 +12714,28 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v51 -; GFX9-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v10, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v11, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v12, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v50, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 -; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload @@ -12761,18 +12772,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v16, v57, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 @@ -12810,39 +12821,39 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v63, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v52 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v43 ; GFX9-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v50 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v52 ; GFX9-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v10, v44, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v12, v51, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: v_add_u16_e32 v15, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX9-NEXT: .LBB26_4: ; %end @@ -26751,170 +26762,171 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v34, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v12 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v51 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v55 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v42 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v53, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -26922,18 +26934,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -26942,7 +26954,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -26950,30 +26961,30 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v22 @@ -26993,54 +27004,59 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v7, v7, v30 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v45 ; SI-NEXT: v_or_b32_e32 v9, v9, v59 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v41 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v51 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 -; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v13, v13, v19 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v23 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 @@ -27067,21 +27083,20 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 @@ -27089,55 +27104,54 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v53, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -27146,20 +27160,20 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -27175,35 +27189,34 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -27229,68 +27242,70 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_or_b32_e32 v9, v59, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v10, v46, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v11, v41, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v14, v25, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: .LBB50_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -27310,7 +27325,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v16f32: @@ -27348,38 +27363,31 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 ; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v3 @@ -27390,38 +27398,52 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v15 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v4 -; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v8 -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v12 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v14 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v24 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v30 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27432,28 +27454,28 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -27480,18 +27502,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -27521,23 +27543,23 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v52, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v61, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v41, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v19, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -27571,24 +27593,24 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB50_4 @@ -27599,27 +27621,27 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v9, 3, v51 -; VI-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v9, 3, v40 +; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u16_e32 v10, 3, v24 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v11, 3, v30 -; VI-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v23 +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 3, v60 -; VI-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v12, 3, v38 +; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v13, 3, v47 -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v58 +; VI-NEXT: v_or_b32_sdwa v13, v50, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v14, 3, v42 -; VI-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v45 +; VI-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v17, 3, v17 -; VI-NEXT: v_or_b32_sdwa v17, v57, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -27655,18 +27677,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_or_b32_sdwa v16, v23, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 @@ -27705,39 +27727,39 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v63, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v52 +; VI-NEXT: v_add_u16_e32 v8, 3, v43 ; VI-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v50 +; VI-NEXT: v_add_u16_e32 v9, 3, v52 ; VI-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 ; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v49 -; VI-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 3, v29 +; VI-NEXT: v_or_b32_sdwa v10, v44, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v63 -; VI-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v26 +; VI-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v48 -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 3, v61 +; VI-NEXT: v_or_b32_sdwa v12, v51, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v44 -; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_add_u16_e32 v14, 3, v41 ; VI-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_or_b32_e32 v14, v14, v16 ; VI-NEXT: v_add_u16_e32 v16, 3, v19 -; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: .LBB50_4: ; %end @@ -27796,39 +27818,32 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 ; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v3 @@ -27838,50 +27853,57 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v38 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v49 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v52 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27891,29 +27913,29 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -27940,18 +27962,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -27981,23 +28003,23 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v43, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v52, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v61, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v41, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -28031,24 +28053,24 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 @@ -28058,28 +28080,28 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v51 -; GFX9-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v10, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v11, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v12, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v50, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 -; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload @@ -28116,18 +28138,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v16, v57, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 @@ -28165,39 +28187,39 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v63, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v52 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v43 ; GFX9-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v50 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v52 ; GFX9-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v10, v44, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v12, v51, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: v_add_u16_e32 v15, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX9-NEXT: .LBB50_4: ; %end @@ -41377,170 +41399,171 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v34, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v12 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v51 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v55 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v42 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v53, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -41548,18 +41571,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -41568,7 +41591,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -41576,30 +41598,30 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v22 @@ -41619,54 +41641,59 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v7, v7, v30 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v45 ; SI-NEXT: v_or_b32_e32 v9, v9, v59 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v41 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v51 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 -; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v13, v13, v19 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v23 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 @@ -41693,21 +41720,20 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: .LBB70_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 @@ -41715,55 +41741,54 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v53, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -41772,20 +41797,20 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -41801,35 +41826,34 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -41855,68 +41879,70 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_or_b32_e32 v9, v59, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v10, v46, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v11, v41, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v14, v25, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: .LBB70_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -41936,7 +41962,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v8i64: @@ -41974,38 +42000,31 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 ; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v3 @@ -42016,38 +42035,52 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v15 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v4 -; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v8 -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v12 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v14 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v24 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v30 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -42058,28 +42091,28 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -42106,18 +42139,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -42147,23 +42180,23 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v52, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v61, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v41, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v19, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -42197,24 +42230,24 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: .LBB70_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB70_4 @@ -42225,27 +42258,27 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v9, 3, v51 -; VI-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v9, 3, v40 +; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u16_e32 v10, 3, v24 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v11, 3, v30 -; VI-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v23 +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 3, v60 -; VI-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v12, 3, v38 +; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v13, 3, v47 -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v58 +; VI-NEXT: v_or_b32_sdwa v13, v50, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v14, 3, v42 -; VI-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v45 +; VI-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v17, 3, v17 -; VI-NEXT: v_or_b32_sdwa v17, v57, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -42281,18 +42314,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_or_b32_sdwa v16, v23, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 @@ -42331,39 +42364,39 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v63, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v52 +; VI-NEXT: v_add_u16_e32 v8, 3, v43 ; VI-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v50 +; VI-NEXT: v_add_u16_e32 v9, 3, v52 ; VI-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 ; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v49 -; VI-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 3, v29 +; VI-NEXT: v_or_b32_sdwa v10, v44, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v63 -; VI-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v26 +; VI-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v48 -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 3, v61 +; VI-NEXT: v_or_b32_sdwa v12, v51, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v44 -; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_add_u16_e32 v14, 3, v41 ; VI-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_or_b32_e32 v14, v14, v16 ; VI-NEXT: v_add_u16_e32 v16, 3, v19 -; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: .LBB70_4: ; %end @@ -42422,39 +42455,32 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 ; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v3 @@ -42464,50 +42490,57 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v49 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v52 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -42517,29 +42550,29 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -42566,18 +42599,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -42607,23 +42640,23 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v43, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v52, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v61, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v41, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -42657,24 +42690,24 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: .LBB70_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB70_4 @@ -42684,28 +42717,28 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v51 -; GFX9-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v10, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v11, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v12, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v50, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 -; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload @@ -42742,18 +42775,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v16, v57, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 @@ -42791,39 +42824,39 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v63, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v52 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v43 ; GFX9-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v50 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v52 ; GFX9-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v10, v44, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v12, v51, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: v_add_u16_e32 v15, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX9-NEXT: .LBB70_4: ; %end @@ -55157,170 +55190,171 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v34, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v12 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v51 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v55 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v42 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v53, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -55328,18 +55362,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -55348,7 +55382,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -55356,30 +55389,30 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v22 @@ -55399,54 +55432,59 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v7, v7, v30 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v45 ; SI-NEXT: v_or_b32_e32 v9, v9, v59 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v41 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v51 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 -; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v13, v13, v19 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v23 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 @@ -55473,21 +55511,20 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: .LBB86_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 @@ -55495,55 +55532,54 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v60, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v46, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v42, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_or_b32_e32 v12, v54, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v53, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -55552,20 +55588,20 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: v_or_b32_e32 v4, v16, v4 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -55581,35 +55617,34 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v8, v63, v8 -; SI-NEXT: v_or_b32_e32 v16, v52, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -55635,68 +55670,70 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_or_b32_e32 v9, v59, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v10, v46, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v11, v41, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v14, v25, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: .LBB86_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -55716,7 +55753,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v8f64: @@ -55754,38 +55791,31 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 ; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v3 @@ -55796,38 +55826,52 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v15 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v4 -; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v8 -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v12 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v14 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v24 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v30 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -55838,28 +55882,28 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -55886,18 +55930,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -55927,23 +55971,23 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v52, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v61, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v41, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v19, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -55977,24 +56021,24 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: .LBB86_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB86_4 @@ -56005,27 +56049,27 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v9, 3, v51 -; VI-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v9, 3, v40 +; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u16_e32 v10, 3, v24 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v11, 3, v30 -; VI-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v23 +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 3, v60 -; VI-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v12, 3, v38 +; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v13, 3, v47 -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v58 +; VI-NEXT: v_or_b32_sdwa v13, v50, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v14, 3, v42 -; VI-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v45 +; VI-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v17, 3, v17 -; VI-NEXT: v_or_b32_sdwa v17, v57, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -56061,18 +56105,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_or_b32_sdwa v16, v23, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v2, 3, v2 @@ -56111,39 +56155,39 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v63, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v52 +; VI-NEXT: v_add_u16_e32 v8, 3, v43 ; VI-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v50 +; VI-NEXT: v_add_u16_e32 v9, 3, v52 ; VI-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 ; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v49 -; VI-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 3, v29 +; VI-NEXT: v_or_b32_sdwa v10, v44, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v63 -; VI-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v26 +; VI-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v48 -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 3, v61 +; VI-NEXT: v_or_b32_sdwa v12, v51, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v44 -; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_add_u16_e32 v14, 3, v41 ; VI-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_or_b32_e32 v14, v14, v16 ; VI-NEXT: v_add_u16_e32 v16, 3, v19 -; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 ; VI-NEXT: .LBB86_4: ; %end @@ -56202,39 +56246,32 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 ; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v3 @@ -56244,50 +56281,57 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v11 ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v49 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v52 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -56297,29 +56341,29 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -56346,18 +56390,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -56387,23 +56431,23 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v43, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v52, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v61, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v41, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -56437,24 +56481,24 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: .LBB86_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB86_4 @@ -56464,28 +56508,28 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v51 -; GFX9-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v10, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v11, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v12, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v50, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 -; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload @@ -56522,18 +56566,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v16, v57, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 @@ -56571,39 +56615,39 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v63, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v52 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v43 ; GFX9-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v50 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v52 ; GFX9-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v10, v44, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 ; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v12, v51, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: v_add_u16_e32 v15, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX9-NEXT: .LBB86_4: ; %end @@ -69144,9 +69188,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 @@ -69159,394 +69203,389 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v4 ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v34 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v49 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v50 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 -; SI-NEXT: v_and_b32_e32 v23, 0xff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v21, v21, v26 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v7, v3, v7 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v48, v1, v3 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v7, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v11, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v11, v9, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v37, v26, v11 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v7, v9 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v7, v1 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 ; SI-NEXT: v_or_b32_e32 v49, v9, v7 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v1, v48, v11, 16 +; SI-NEXT: v_alignbit_b32 v13, v48, v11, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v15, v13, v9 ; SI-NEXT: v_alignbit_b32 v9, v49, v15, 16 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v35, v11, v15 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v19, v17, v13 +; SI-NEXT: v_or_b32_e32 v50, v9, v19 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v19, v13, v9 -; SI-NEXT: v_or_b32_e32 v50, v8, v19 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v9, v50, v0, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v16, v8 -; SI-NEXT: v_or_b32_e32 v51, v6, v8 +; SI-NEXT: v_or_b32_e32 v23, v63, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_alignbit_b32 v9, v50, v23, 16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_or_b32_e32 v33, v11, v23 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v13 +; SI-NEXT: v_or_b32_e32 v51, v6, v16 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v16, 0xff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v22, v16 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v22, v22, v60 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_or_b32_e32 v52, v17, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v20, v30, v17 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_alignbit_b32 v17, v52, v20, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v57 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v52, v14, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v18, v14 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_alignbit_b32 v13, v51, v6, 16 -; SI-NEXT: v_alignbit_b32 v17, v52, v14, 16 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v18, v18, v26 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_or_b32_e32 v53, v18, v10 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v20, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v42, v20 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v2, v2, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v24, v58, v23 -; SI-NEXT: v_or_b32_e32 v55, v22, v24 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v18, v18, v44 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v54, v18, v20 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v56, v18 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_or_b32_e32 v53, v21, v10 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v24 ; SI-NEXT: v_alignbit_b32 v21, v53, v2, 16 -; SI-NEXT: v_alignbit_b32 v25, v54, v18, 16 -; SI-NEXT: v_alignbit_b32 v29, v55, v5, 16 +; SI-NEXT: v_or_b32_e32 v54, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v8, v8, v60 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_alignbit_b32 v25, v54, v0, 16 +; SI-NEXT: v_or_b32_e32 v55, v8, v24 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: v_alignbit_b32 v29, v55, v8, 16 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v22 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v37, v12, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v35, v11, v15 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v63 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v33, v11, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_or_b32_e32 v32, v11, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v32, v0, v6 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v34, v4, v20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v34, v0, v14 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v36, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v36, v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v38, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v38, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 -; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v39, v0, v5 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_or_b32_e32 v39, v0, v8 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -69568,25 +69607,21 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB98_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 @@ -69596,156 +69631,155 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v56, v3 -; SI-NEXT: v_alignbit_b32 v29, v55, v39, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v62 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_alignbit_b32 v25, v54, v38, 16 +; SI-NEXT: v_alignbit_b32 v29, v55, v39, 16 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v53, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_or_b32_e32 v1, v26, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v10, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v53, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 ; SI-NEXT: v_alignbit_b32 v21, v53, v36, 16 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v43 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v22, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v52, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v52, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v17, v52, v34, 16 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v16, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v51, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v51, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v13, v51, v32, 16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v51 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -69774,8 +69808,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -69792,8 +69826,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v49, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 @@ -69811,7 +69845,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 @@ -69828,12 +69862,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 -; SI-NEXT: v_alignbit_b32 v1, v48, v37, 16 +; SI-NEXT: v_alignbit_b32 v0, v48, v37, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v0, v49, v35, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -69861,6 +69898,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v12, v32 ; SI-NEXT: v_mov_b32_e32 v14, v51 ; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v52 ; SI-NEXT: v_mov_b32_e32 v20, v36 ; SI-NEXT: v_mov_b32_e32 v22, v53 @@ -69906,78 +69944,78 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v29 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:116 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v8 -; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v10 -; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v12 -; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v14 -; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v16 -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v18 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v16 +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v18 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v20 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v22 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v24 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v26 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v31 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:52 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -69989,79 +70027,80 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v9, v39, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v10, v20, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v28, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v22, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v30, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v38, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v29, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v46, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v22, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v31, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v0, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v1, v1, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v7, v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v8, v8, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v4, v4, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -70078,23 +70117,23 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v49, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v55, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v55, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v36, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v18, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v26, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v44, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v35, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v61, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -70128,98 +70167,92 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: .LBB98_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_add_u16_e32 v0, 3, v38 -; VI-NEXT: v_add_u16_e32 v2, 3, v44 +; VI-NEXT: v_add_u16_e32 v0, 3, v16 ; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v14, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v2, 3, v35 ; VI-NEXT: v_mov_b32_e32 v3, 0x300 -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v18, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v0, 3, v37 -; VI-NEXT: v_or_b32_sdwa v24, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v16, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_or_b32_sdwa v20, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v0, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v43 -; VI-NEXT: v_or_b32_sdwa v16, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v0, 3, v22 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v0, 3, v36 -; VI-NEXT: v_or_b32_sdwa v22, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v26, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v18, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v46 ; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v10, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v55 -; VI-NEXT: v_or_b32_sdwa v28, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_add_u16_e32 v0, 3, v44 +; VI-NEXT: v_or_b32_sdwa v22, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v41 ; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 ; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v48 -; VI-NEXT: v_or_b32_sdwa v20, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v39 +; VI-NEXT: v_add_u16_e32 v0, 3, v55 +; VI-NEXT: v_or_b32_sdwa v14, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v28, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v29 ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v49 -; VI-NEXT: v_or_b32_sdwa v30, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v27 +; VI-NEXT: v_or_b32_sdwa v27, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v1, 3, v61 -; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v63 +; VI-NEXT: v_or_b32_sdwa v15, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v1, 3, v31 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v26, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v12, v16, v12 -; VI-NEXT: v_add_u16_e32 v16, 0x300, v24 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 -; VI-NEXT: v_or_b32_e32 v13, v16, v13 -; VI-NEXT: v_or_b32_e32 v14, v14, v26 -; VI-NEXT: v_or_b32_e32 v15, v15, v18 +; VI-NEXT: v_or_b32_e32 v14, v14, v24 +; VI-NEXT: v_or_b32_e32 v15, v15, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v29, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v29, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -70252,54 +70285,58 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v6, v17, v6 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v29 ; VI-NEXT: v_or_b32_e32 v7, v17, v7 -; VI-NEXT: v_add_u16_e32 v17, 0x300, v30 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v27 ; VI-NEXT: v_or_b32_e32 v8, v17, v8 -; VI-NEXT: v_add_u16_e32 v17, 0x300, v20 -; VI-NEXT: v_or_b32_e32 v9, v17, v9 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v28 -; VI-NEXT: v_or_b32_e32 v10, v17, v10 +; VI-NEXT: v_or_b32_e32 v9, v17, v9 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v10, v17, v10 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v18 ; VI-NEXT: v_or_b32_e32 v11, v17, v11 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v26 +; VI-NEXT: v_or_b32_e32 v12, v17, v12 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v20 +; VI-NEXT: v_or_b32_e32 v13, v17, v13 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v23, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v27, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v30, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v31, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v31, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v31 ; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_add_u16_e32 v3, 0x300, v27 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v30 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 @@ -70359,99 +70396,100 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v29 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v20 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v18 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v22 ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v24 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v28 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v32 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload @@ -70459,108 +70497,107 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_or_b32_sdwa v9, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v63, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v35, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v16, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v38, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_or_b32_sdwa v9, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v41, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v12, v26, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v37, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: v_or_b32_sdwa v8, v51, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_or_b32_sdwa v9, v52, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_or_b32_sdwa v10, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v47, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_or_b32_sdwa v11, v37, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v35, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_or_b32_sdwa v12, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: v_or_b32_sdwa v13, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v24, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_or_b32_sdwa v14, v47, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: v_or_b32_sdwa v15, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v38, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -70594,111 +70631,110 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: .LBB98_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 ; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v47 ; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v63 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v52 +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v3, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v3, v30, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 -; GFX9-NEXT: v_add_u16_e32 v1, 3, v38 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v1 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v1 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v8, v30, v8, s6 -; GFX9-NEXT: v_perm_b32 v9, v28, v9, s6 -; GFX9-NEXT: v_perm_b32 v10, v26, v10, s6 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s6 -; GFX9-NEXT: v_perm_b32 v12, v22, v12, s6 -; GFX9-NEXT: v_perm_b32 v13, v16, v13, s6 -; GFX9-NEXT: v_perm_b32 v14, v20, v14, s6 -; GFX9-NEXT: v_perm_b32 v15, v18, v15, s6 +; GFX9-NEXT: v_perm_b32 v8, v25, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v24, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v22, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v20, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v16, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v18, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v28, v15, s6 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v31, v48, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v31, v39, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 +; GFX9-NEXT: v_perm_b32 v7, v26, v7, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -70720,7 +70756,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -70731,40 +70767,40 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v2, v25, v2, s6 +; GFX9-NEXT: v_perm_b32 v2, v27, v2, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 ; GFX9-NEXT: .LBB98_4: ; %end @@ -71630,13 +71666,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: v_readfirstlane_b32 s15, v27 ; SI-NEXT: v_readfirstlane_b32 s40, v26 ; SI-NEXT: v_readfirstlane_b32 s12, v19 @@ -71668,14 +71704,14 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v38 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v39 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v48 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v59 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v61 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v62 @@ -71768,10 +71804,9 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v21, v21, v43 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v33, v58, v32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v46 ; SI-NEXT: s_or_b32 s4, s4, s56 ; SI-NEXT: v_or_b32_e32 v25, v54, v17 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -71789,13 +71824,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s56, s15, 8 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v48, v32, v63 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v57 ; SI-NEXT: s_or_b32 s4, s4, s56 ; SI-NEXT: v_or_b32_e32 v29, v44, v21 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v26, v26, v57 +; SI-NEXT: v_or_b32_e32 v26, v26, v56 ; SI-NEXT: v_or_b32_e32 v34, v61, v32 ; SI-NEXT: v_or_b32_e32 v32, s4, v29 ; SI-NEXT: s_and_b32 s4, s43, 0xff @@ -71833,7 +71868,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_and_b32 s4, s45, 0xff ; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -71842,11 +71878,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v34, vcc, 0x3000000, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 ; SI-NEXT: v_or_b32_e32 v1, v60, v1 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 @@ -71870,7 +71905,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v55 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -82280,29 +82315,53 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:132 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; kill: killed $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -82314,21 +82373,19 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; kill: killed $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v29 ; SI-NEXT: ; kill: killed $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; kill: killed $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -82336,205 +82393,104 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v4 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v6 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v10 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v26 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v35 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v36 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v37 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v38 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v39 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v49 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v39 -; SI-NEXT: v_or_b32_e32 v6, v6, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v44 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v16, v19, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v17, v17, v20 -; SI-NEXT: v_or_b32_e32 v15, v15, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: v_or_b32_e32 v15, v15, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: v_or_b32_e32 v0, v0, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: v_or_b32_e32 v0, v0, v55 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: v_or_b32_e32 v0, v0, v46 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: v_or_b32_e32 v0, v0, v56 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 @@ -82577,332 +82533,389 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v60 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v62 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v63 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v9, v9, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v59 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v61 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: .LBB106_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v30 -; SI-NEXT: v_or_b32_e32 v7, v56, v7 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v11, v1, v11 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v45 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_or_b32_e32 v7, v41, v7 -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v11 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v9 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v7, v3, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v4, v47, v4 -; SI-NEXT: v_or_b32_e32 v5, v43, v5 -; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v4 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v44 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v9, v35, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v0, v0, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v10, v7 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: v_or_b32_e32 v23, v57, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v12, v7 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v16, v7 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_or_b32_e32 v7, v16, v7 ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v20, v7 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v5, v40, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v26, v7 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_or_b32_e32 v28, v31, v28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v7, v61, v7 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v63, v7 -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v7, v58, v7 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v22, v59, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v24, v7 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v7, v63, v7 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v40, v7 +; SI-NEXT: v_or_b32_e32 v7, v59, v7 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v24, v26, v24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v62, v7 -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v7, v57, v7 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v26, v58, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v61, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -82927,11 +82940,9 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v16, v32 ; SI-NEXT: v_mov_b32_e32 v18, v34 ; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v22, v38 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v24, v48 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v26, v50 ; SI-NEXT: v_mov_b32_e32 v28, v52 ; SI-NEXT: v_mov_b32_e32 v30, v54 @@ -82973,78 +82984,78 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v29 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:116 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v8 -; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v10 -; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v12 -; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v14 -; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v16 -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v18 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v16 +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v18 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v20 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v22 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v24 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v26 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v31 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:52 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -83056,79 +83067,80 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v9, v39, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v10, v20, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v28, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v22, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v30, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v38, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v29, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v46, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v22, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v31, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v0, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v1, v1, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v7, v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v8, v8, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v4, v4, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -83145,23 +83157,23 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v49, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v55, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v55, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v36, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v18, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v26, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v44, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v35, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v61, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -83195,98 +83207,92 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: .LBB106_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB106_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_add_u16_e32 v0, 3, v38 -; VI-NEXT: v_add_u16_e32 v2, 3, v44 +; VI-NEXT: v_add_u16_e32 v0, 3, v16 ; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v14, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v2, 3, v35 ; VI-NEXT: v_mov_b32_e32 v3, 0x300 -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v18, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v0, 3, v37 -; VI-NEXT: v_or_b32_sdwa v24, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v16, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_or_b32_sdwa v20, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v0, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v43 -; VI-NEXT: v_or_b32_sdwa v16, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v0, 3, v22 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v0, 3, v36 -; VI-NEXT: v_or_b32_sdwa v22, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v26, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v18, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v46 ; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v10, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v55 -; VI-NEXT: v_or_b32_sdwa v28, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_add_u16_e32 v0, 3, v44 +; VI-NEXT: v_or_b32_sdwa v22, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v41 ; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 ; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v48 -; VI-NEXT: v_or_b32_sdwa v20, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v39 +; VI-NEXT: v_add_u16_e32 v0, 3, v55 +; VI-NEXT: v_or_b32_sdwa v14, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v28, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v29 ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v49 -; VI-NEXT: v_or_b32_sdwa v30, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v27 +; VI-NEXT: v_or_b32_sdwa v27, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v1, 3, v61 -; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v63 +; VI-NEXT: v_or_b32_sdwa v15, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v1, 3, v31 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v26, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v12, v16, v12 -; VI-NEXT: v_add_u16_e32 v16, 0x300, v24 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 -; VI-NEXT: v_or_b32_e32 v13, v16, v13 -; VI-NEXT: v_or_b32_e32 v14, v14, v26 -; VI-NEXT: v_or_b32_e32 v15, v15, v18 +; VI-NEXT: v_or_b32_e32 v14, v14, v24 +; VI-NEXT: v_or_b32_e32 v15, v15, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v29, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v29, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -83319,54 +83325,58 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v6, v17, v6 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v29 ; VI-NEXT: v_or_b32_e32 v7, v17, v7 -; VI-NEXT: v_add_u16_e32 v17, 0x300, v30 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v27 ; VI-NEXT: v_or_b32_e32 v8, v17, v8 -; VI-NEXT: v_add_u16_e32 v17, 0x300, v20 -; VI-NEXT: v_or_b32_e32 v9, v17, v9 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v28 -; VI-NEXT: v_or_b32_e32 v10, v17, v10 +; VI-NEXT: v_or_b32_e32 v9, v17, v9 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v10, v17, v10 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v18 ; VI-NEXT: v_or_b32_e32 v11, v17, v11 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v26 +; VI-NEXT: v_or_b32_e32 v12, v17, v12 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v20 +; VI-NEXT: v_or_b32_e32 v13, v17, v13 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v23, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v27, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v30, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v31, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v31, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v31 ; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_add_u16_e32 v3, 0x300, v27 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v30 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 @@ -83426,99 +83436,100 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v29 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v20 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v18 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v22 ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v24 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v28 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v32 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB106_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload @@ -83526,108 +83537,107 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_or_b32_sdwa v9, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v63, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v35, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v16, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v38, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_or_b32_sdwa v9, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v41, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v12, v26, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v37, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: v_or_b32_sdwa v8, v51, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_or_b32_sdwa v9, v52, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_or_b32_sdwa v10, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v47, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_or_b32_sdwa v11, v37, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v35, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_or_b32_sdwa v12, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: v_or_b32_sdwa v13, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v24, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_or_b32_sdwa v14, v47, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: v_or_b32_sdwa v15, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v38, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -83661,111 +83671,110 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: .LBB106_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB106_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 ; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v47 ; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v63 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v52 +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v3, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v3, v30, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 -; GFX9-NEXT: v_add_u16_e32 v1, 3, v38 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v1 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v1 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v8, v30, v8, s6 -; GFX9-NEXT: v_perm_b32 v9, v28, v9, s6 -; GFX9-NEXT: v_perm_b32 v10, v26, v10, s6 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s6 -; GFX9-NEXT: v_perm_b32 v12, v22, v12, s6 -; GFX9-NEXT: v_perm_b32 v13, v16, v13, s6 -; GFX9-NEXT: v_perm_b32 v14, v20, v14, s6 -; GFX9-NEXT: v_perm_b32 v15, v18, v15, s6 +; GFX9-NEXT: v_perm_b32 v8, v25, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v24, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v22, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v20, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v16, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v18, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v28, v15, s6 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v31, v48, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v31, v39, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 +; GFX9-NEXT: v_perm_b32 v7, v26, v7, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -83787,7 +83796,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -83798,40 +83807,40 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v2, v25, v2, s6 +; GFX9-NEXT: v_perm_b32 v2, v27, v2, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 ; GFX9-NEXT: .LBB106_4: ; %end @@ -93581,129 +93590,123 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v31 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v32 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v34 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 @@ -93713,7 +93716,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_2 @@ -93729,7 +93732,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload @@ -93740,7 +93743,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload @@ -93763,7 +93766,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_or_b32_e32 v33, v7, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -93795,105 +93798,101 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v11, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v11, v2, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v32, v10, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v13, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v15, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; SI-NEXT: v_or_b32_e32 v32, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v36, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v13, v10, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v15, v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v36, v1, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 +; SI-NEXT: v_or_b32_e32 v37, v22, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v37, v26, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v0, v0, v8 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v19, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v28 ; SI-NEXT: v_or_b32_e32 v48, v1, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v57 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v21, v30, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v21, v18, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v23, v25, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v23, v8, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v59 ; SI-NEXT: v_or_b32_e32 v52, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v53, v16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v53, v12, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v27, v45, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v27, v46, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v30 ; SI-NEXT: v_or_b32_e32 v40, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v29, v56, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v29, v58, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v31, v61, v0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -93955,177 +93954,183 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: .LBB110_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: v_or_b32_e32 v5, v61, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v9 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 -; SI-NEXT: v_or_b32_e32 v3, v47, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v45, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_or_b32_e32 v5, v46, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 -; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v62 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v20 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v17 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v5, v18, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v8 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v25 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v26, v5 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v14, v3 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 @@ -94138,50 +94143,50 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -94196,7 +94201,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 @@ -94253,21 +94258,21 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v2, v43 ; SI-NEXT: v_mov_b32_e32 v10, v41 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: v_mov_b32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -94293,7 +94298,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v12, v32 ; SI-NEXT: v_mov_b32_e32 v14, v34 ; SI-NEXT: v_mov_b32_e32 v16, v36 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v17, v37 ; SI-NEXT: v_mov_b32_e32 v18, v38 ; SI-NEXT: v_mov_b32_e32 v20, v48 @@ -94339,78 +94343,78 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v29 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:116 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 -; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v8 -; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v10 -; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v12 -; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v14 -; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v16 -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v18 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v20 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v16 +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v18 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v20 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v22 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v24 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v26 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v31 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:52 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -94422,79 +94426,80 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v9, v39, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v10, v20, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v28, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v22, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v30, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v38, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v29, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v46, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v22, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v31, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v0, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v1, v1, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v7, v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v8, v8, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v4, v4, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94511,23 +94516,23 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v49, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v55, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v55, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v36, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v18, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v26, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v44, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v35, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v61, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -94561,98 +94566,92 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: .LBB110_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB110_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_add_u16_e32 v0, 3, v38 -; VI-NEXT: v_add_u16_e32 v2, 3, v44 +; VI-NEXT: v_add_u16_e32 v0, 3, v16 ; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v14, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v2, 3, v35 ; VI-NEXT: v_mov_b32_e32 v3, 0x300 -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v18, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v0, 3, v37 -; VI-NEXT: v_or_b32_sdwa v24, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v16, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_or_b32_sdwa v20, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v0, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v43 -; VI-NEXT: v_or_b32_sdwa v16, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v0, 3, v22 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v0, 3, v36 -; VI-NEXT: v_or_b32_sdwa v22, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v26, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v18, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v46 ; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v10, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v55 -; VI-NEXT: v_or_b32_sdwa v28, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_add_u16_e32 v0, 3, v44 +; VI-NEXT: v_or_b32_sdwa v22, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v41 ; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 ; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v48 -; VI-NEXT: v_or_b32_sdwa v20, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v39 +; VI-NEXT: v_add_u16_e32 v0, 3, v55 +; VI-NEXT: v_or_b32_sdwa v14, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v28, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v29 ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v49 -; VI-NEXT: v_or_b32_sdwa v30, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v27 +; VI-NEXT: v_or_b32_sdwa v27, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v1, 3, v61 -; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v63 +; VI-NEXT: v_or_b32_sdwa v15, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v1, 3, v31 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v26, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v12, v16, v12 -; VI-NEXT: v_add_u16_e32 v16, 0x300, v24 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 -; VI-NEXT: v_or_b32_e32 v13, v16, v13 -; VI-NEXT: v_or_b32_e32 v14, v14, v26 -; VI-NEXT: v_or_b32_e32 v15, v15, v18 +; VI-NEXT: v_or_b32_e32 v14, v14, v24 +; VI-NEXT: v_or_b32_e32 v15, v15, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v29, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v29, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -94685,54 +94684,58 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v6, v17, v6 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v29 ; VI-NEXT: v_or_b32_e32 v7, v17, v7 -; VI-NEXT: v_add_u16_e32 v17, 0x300, v30 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v27 ; VI-NEXT: v_or_b32_e32 v8, v17, v8 -; VI-NEXT: v_add_u16_e32 v17, 0x300, v20 -; VI-NEXT: v_or_b32_e32 v9, v17, v9 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v28 -; VI-NEXT: v_or_b32_e32 v10, v17, v10 +; VI-NEXT: v_or_b32_e32 v9, v17, v9 ; VI-NEXT: v_add_u16_e32 v17, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v10, v17, v10 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v18 ; VI-NEXT: v_or_b32_e32 v11, v17, v11 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v26 +; VI-NEXT: v_or_b32_e32 v12, v17, v12 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v20 +; VI-NEXT: v_or_b32_e32 v13, v17, v13 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v23, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v27, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v30, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v31, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v31, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v31 ; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_add_u16_e32 v3, 0x300, v27 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v30 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 @@ -94792,99 +94795,100 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v29 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v20 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v18 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v22 ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v20 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v24 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v28 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v32 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB110_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload @@ -94892,108 +94896,107 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_or_b32_sdwa v9, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v63, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v35, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v16, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v38, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_or_b32_sdwa v9, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v41, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v12, v26, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v37, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: v_or_b32_sdwa v8, v51, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_or_b32_sdwa v9, v52, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_or_b32_sdwa v10, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v47, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_or_b32_sdwa v11, v37, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v35, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_or_b32_sdwa v12, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: v_or_b32_sdwa v13, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v24, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_or_b32_sdwa v14, v47, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: v_or_b32_sdwa v15, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v38, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -95027,111 +95030,110 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: .LBB110_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB110_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 ; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v47 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v47 ; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v63 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v52 +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v3, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v3, v30, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 -; GFX9-NEXT: v_add_u16_e32 v1, 3, v38 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v1 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v1 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v8, v30, v8, s6 -; GFX9-NEXT: v_perm_b32 v9, v28, v9, s6 -; GFX9-NEXT: v_perm_b32 v10, v26, v10, s6 -; GFX9-NEXT: v_perm_b32 v11, v24, v11, s6 -; GFX9-NEXT: v_perm_b32 v12, v22, v12, s6 -; GFX9-NEXT: v_perm_b32 v13, v16, v13, s6 -; GFX9-NEXT: v_perm_b32 v14, v20, v14, s6 -; GFX9-NEXT: v_perm_b32 v15, v18, v15, s6 +; GFX9-NEXT: v_perm_b32 v8, v25, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v24, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v22, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v20, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v16, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v18, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v30, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v28, v15, s6 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v31, v48, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v31, v39, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 +; GFX9-NEXT: v_perm_b32 v7, v26, v7, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -95153,7 +95155,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -95164,40 +95166,40 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v2, v25, v2, s6 +; GFX9-NEXT: v_perm_b32 v2, v27, v2, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 ; GFX9-NEXT: .LBB110_4: ; %end diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index dda05a8897979..3a26a5c263d78 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -3547,8 +3547,8 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v36, v22 @@ -3580,9 +3580,9 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6864,6 +6864,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 @@ -6887,13 +6888,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -6905,7 +6905,6 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 @@ -11192,8 +11191,8 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v36, v22 @@ -11225,9 +11224,9 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -14548,6 +14547,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 @@ -14571,13 +14571,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -14589,7 +14588,6 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 @@ -18055,8 +18053,8 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v36, v22 @@ -18088,9 +18086,9 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -21382,6 +21380,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 @@ -21405,13 +21404,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -21423,7 +21421,6 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 @@ -24140,8 +24137,8 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v36, v22 @@ -24173,9 +24170,9 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27415,6 +27412,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 @@ -27438,13 +27436,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -27456,7 +27453,6 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 @@ -28860,13 +28856,13 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 @@ -28911,7 +28907,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28939,37 +28935,37 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -29001,12 +28997,12 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB56_2: ; %Flow @@ -29024,36 +29020,36 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 ; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 ; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 @@ -29102,7 +29098,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: .LBB56_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 4a8000ba52752..cc55ba1d84df6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3698,26 +3698,24 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 ; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 @@ -3732,23 +3730,25 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -3761,14 +3761,14 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -3795,8 +3795,8 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v36 -; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_or_b32_e32 v3, v3, v35 ; SI-NEXT: v_or_b32_e32 v4, v4, v40 ; SI-NEXT: v_or_b32_e32 v5, v5, v34 @@ -3822,8 +3822,8 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -3836,19 +3836,19 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v60 ; SI-NEXT: v_or_b32_e32 v12, v12, v59 ; SI-NEXT: v_or_b32_e32 v13, v13, v58 @@ -3870,14 +3870,14 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -3904,8 +3904,8 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_or_b32_e32 v4, v40, v4 ; SI-NEXT: v_or_b32_e32 v5, v34, v5 @@ -3929,15 +3929,13 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 @@ -7274,6 +7272,8 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -7362,58 +7362,55 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -11800,26 +11797,24 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 ; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 @@ -11834,23 +11829,25 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11863,14 +11860,14 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -11897,8 +11894,8 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v36 -; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_or_b32_e32 v3, v3, v35 ; SI-NEXT: v_or_b32_e32 v4, v4, v40 ; SI-NEXT: v_or_b32_e32 v5, v5, v34 @@ -11924,8 +11921,8 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -11938,19 +11935,19 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v60 ; SI-NEXT: v_or_b32_e32 v12, v12, v59 ; SI-NEXT: v_or_b32_e32 v13, v13, v58 @@ -11972,14 +11969,14 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -12006,8 +12003,8 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_or_b32_e32 v4, v40, v4 ; SI-NEXT: v_or_b32_e32 v5, v34, v5 @@ -12031,15 +12028,13 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 @@ -15364,6 +15359,8 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -15452,58 +15449,55 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -19214,26 +19208,24 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 ; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 @@ -19248,23 +19240,25 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -19277,14 +19271,14 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -19311,8 +19305,8 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v36 -; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_or_b32_e32 v3, v3, v35 ; SI-NEXT: v_or_b32_e32 v4, v4, v40 ; SI-NEXT: v_or_b32_e32 v5, v5, v34 @@ -19338,8 +19332,8 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -19352,19 +19346,19 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v60 ; SI-NEXT: v_or_b32_e32 v12, v12, v59 ; SI-NEXT: v_or_b32_e32 v13, v13, v58 @@ -19386,14 +19380,14 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -19420,8 +19414,8 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_or_b32_e32 v4, v40, v4 ; SI-NEXT: v_or_b32_e32 v5, v34, v5 @@ -19445,15 +19439,13 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 @@ -22800,6 +22792,8 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -22888,58 +22882,55 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -25855,26 +25846,24 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 ; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 ; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 @@ -25889,23 +25878,25 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -25918,14 +25909,14 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 @@ -25952,8 +25943,8 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v36 -; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_or_b32_e32 v3, v3, v35 ; SI-NEXT: v_or_b32_e32 v4, v4, v40 ; SI-NEXT: v_or_b32_e32 v5, v5, v34 @@ -25979,8 +25970,8 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 @@ -25993,19 +25984,19 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v60 ; SI-NEXT: v_or_b32_e32 v12, v12, v59 ; SI-NEXT: v_or_b32_e32 v13, v13, v58 @@ -26027,14 +26018,14 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -26061,8 +26052,8 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_or_b32_e32 v4, v40, v4 ; SI-NEXT: v_or_b32_e32 v5, v34, v5 @@ -26086,15 +26077,13 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 @@ -29351,6 +29340,8 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -29439,58 +29430,55 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 4141c33eca786..2a96722ccce0b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3960,93 +3960,98 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 -; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -4075,20 +4080,19 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: v_or_b32_e32 v2, v2, v43 -; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v43 ; SI-NEXT: v_or_b32_e32 v4, v4, v36 ; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v6, v6, v35 -; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 ; SI-NEXT: v_or_b32_e32 v8, v8, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -4098,63 +4102,63 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v10, v10, v33 ; SI-NEXT: v_or_b32_e32 v11, v11, v32 ; SI-NEXT: v_or_b32_e32 v12, v12, v63 ; SI-NEXT: v_or_b32_e32 v13, v13, v62 ; SI-NEXT: v_or_b32_e32 v14, v14, v61 ; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 ; SI-NEXT: v_or_b32_e32 v17, v17, v58 ; SI-NEXT: v_or_b32_e32 v18, v18, v57 ; SI-NEXT: v_or_b32_e32 v19, v19, v56 ; SI-NEXT: v_or_b32_e32 v20, v20, v47 ; SI-NEXT: v_or_b32_e32 v21, v21, v46 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 @@ -4164,18 +4168,19 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -4185,8 +4190,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -4196,18 +4200,18 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v43, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 ; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v6, v41, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 ; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -4217,58 +4221,57 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 ; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: v_or_b32_e32 v11, v32, v11 ; SI-NEXT: v_or_b32_e32 v12, v63, v12 ; SI-NEXT: v_or_b32_e32 v13, v62, v13 ; SI-NEXT: v_or_b32_e32 v14, v61, v14 ; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 ; SI-NEXT: v_or_b32_e32 v17, v58, v17 ; SI-NEXT: v_or_b32_e32 v18, v57, v18 ; SI-NEXT: v_or_b32_e32 v19, v56, v19 ; SI-NEXT: v_or_b32_e32 v20, v47, v20 ; SI-NEXT: v_or_b32_e32 v21, v46, v21 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 @@ -7996,73 +7999,72 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload @@ -12848,93 +12850,98 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 -; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -12963,20 +12970,19 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: v_or_b32_e32 v2, v2, v43 -; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v43 ; SI-NEXT: v_or_b32_e32 v4, v4, v36 ; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v6, v6, v35 -; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 ; SI-NEXT: v_or_b32_e32 v8, v8, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -12986,63 +12992,63 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v10, v10, v33 ; SI-NEXT: v_or_b32_e32 v11, v11, v32 ; SI-NEXT: v_or_b32_e32 v12, v12, v63 ; SI-NEXT: v_or_b32_e32 v13, v13, v62 ; SI-NEXT: v_or_b32_e32 v14, v14, v61 ; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 ; SI-NEXT: v_or_b32_e32 v17, v17, v58 ; SI-NEXT: v_or_b32_e32 v18, v18, v57 ; SI-NEXT: v_or_b32_e32 v19, v19, v56 ; SI-NEXT: v_or_b32_e32 v20, v20, v47 ; SI-NEXT: v_or_b32_e32 v21, v21, v46 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 @@ -13052,18 +13058,19 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -13073,8 +13080,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -13084,18 +13090,18 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v43, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 ; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v6, v41, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 ; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -13105,58 +13111,57 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 ; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: v_or_b32_e32 v11, v32, v11 ; SI-NEXT: v_or_b32_e32 v12, v63, v12 ; SI-NEXT: v_or_b32_e32 v13, v62, v13 ; SI-NEXT: v_or_b32_e32 v14, v61, v14 ; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 ; SI-NEXT: v_or_b32_e32 v17, v58, v17 ; SI-NEXT: v_or_b32_e32 v18, v57, v18 ; SI-NEXT: v_or_b32_e32 v19, v56, v19 ; SI-NEXT: v_or_b32_e32 v20, v47, v20 ; SI-NEXT: v_or_b32_e32 v21, v46, v21 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 @@ -16884,73 +16889,72 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload @@ -21026,93 +21030,98 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 -; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -21141,20 +21150,19 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: v_or_b32_e32 v2, v2, v43 -; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v43 ; SI-NEXT: v_or_b32_e32 v4, v4, v36 ; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v6, v6, v35 -; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 ; SI-NEXT: v_or_b32_e32 v8, v8, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -21164,63 +21172,63 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v10, v10, v33 ; SI-NEXT: v_or_b32_e32 v11, v11, v32 ; SI-NEXT: v_or_b32_e32 v12, v12, v63 ; SI-NEXT: v_or_b32_e32 v13, v13, v62 ; SI-NEXT: v_or_b32_e32 v14, v14, v61 ; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 ; SI-NEXT: v_or_b32_e32 v17, v17, v58 ; SI-NEXT: v_or_b32_e32 v18, v18, v57 ; SI-NEXT: v_or_b32_e32 v19, v19, v56 ; SI-NEXT: v_or_b32_e32 v20, v20, v47 ; SI-NEXT: v_or_b32_e32 v21, v21, v46 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 @@ -21230,18 +21238,19 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -21251,8 +21260,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -21262,18 +21270,18 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v43, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 ; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v6, v41, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 ; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -21283,58 +21291,57 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 ; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: v_or_b32_e32 v11, v32, v11 ; SI-NEXT: v_or_b32_e32 v12, v63, v12 ; SI-NEXT: v_or_b32_e32 v13, v62, v13 ; SI-NEXT: v_or_b32_e32 v14, v61, v14 ; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 ; SI-NEXT: v_or_b32_e32 v17, v58, v17 ; SI-NEXT: v_or_b32_e32 v18, v57, v18 ; SI-NEXT: v_or_b32_e32 v19, v56, v19 ; SI-NEXT: v_or_b32_e32 v20, v47, v20 ; SI-NEXT: v_or_b32_e32 v21, v46, v21 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 @@ -25074,73 +25081,72 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload @@ -28366,93 +28372,98 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 -; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -28481,20 +28492,19 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: v_or_b32_e32 v2, v2, v43 -; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v43 ; SI-NEXT: v_or_b32_e32 v4, v4, v36 ; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v6, v6, v35 -; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v6, v6, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v35 ; SI-NEXT: v_or_b32_e32 v8, v8, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -28504,63 +28514,63 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v10, v10, v33 ; SI-NEXT: v_or_b32_e32 v11, v11, v32 ; SI-NEXT: v_or_b32_e32 v12, v12, v63 ; SI-NEXT: v_or_b32_e32 v13, v13, v62 ; SI-NEXT: v_or_b32_e32 v14, v14, v61 ; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 ; SI-NEXT: v_or_b32_e32 v17, v17, v58 ; SI-NEXT: v_or_b32_e32 v18, v18, v57 ; SI-NEXT: v_or_b32_e32 v19, v19, v56 ; SI-NEXT: v_or_b32_e32 v20, v20, v47 ; SI-NEXT: v_or_b32_e32 v21, v21, v46 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 @@ -28570,18 +28580,19 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -28591,8 +28602,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -28602,18 +28612,18 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v2, v43, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 ; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 -; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v6, v41, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 ; SI-NEXT: v_or_b32_e32 v8, v34, v8 -; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -28623,58 +28633,57 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v9, v40, v9 ; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: v_or_b32_e32 v11, v32, v11 ; SI-NEXT: v_or_b32_e32 v12, v63, v12 ; SI-NEXT: v_or_b32_e32 v13, v62, v13 ; SI-NEXT: v_or_b32_e32 v14, v61, v14 ; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 ; SI-NEXT: v_or_b32_e32 v17, v58, v17 ; SI-NEXT: v_or_b32_e32 v18, v57, v18 ; SI-NEXT: v_or_b32_e32 v19, v56, v19 ; SI-NEXT: v_or_b32_e32 v20, v47, v20 ; SI-NEXT: v_or_b32_e32 v21, v46, v21 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 @@ -32325,73 +32334,72 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload @@ -36331,162 +36339,157 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v44f16_to_v44i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v30 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v44 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v45 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v23, v46 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v22, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v27 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v1, v1, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v3, v3, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v8, v8, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12 -; SI-NEXT: v_or_b32_e32 v11, v11, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v19, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 @@ -36533,30 +36536,30 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_or_b32_e32 v48, v26, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v51, v27, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v55 ; SI-NEXT: v_or_b32_e32 v50, v29, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 @@ -36570,29 +36573,29 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v36 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v36, v30, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 @@ -36601,25 +36604,25 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v33, v33, v30 ; SI-NEXT: v_or_b32_e32 v21, v21, v52 -; SI-NEXT: v_or_b32_e32 v18, v18, v25 +; SI-NEXT: v_or_b32_e32 v17, v17, v25 ; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v23 -; SI-NEXT: v_or_b32_e32 v13, v13, v22 -; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v23 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_or_b32_e32 v11, v11, v18 ; SI-NEXT: v_alignbit_b32 v41, v48, v26, 16 ; SI-NEXT: v_alignbit_b32 v40, v37, v27, 16 ; SI-NEXT: v_alignbit_b32 v55, v34, v28, 16 ; SI-NEXT: v_alignbit_b32 v54, v31, v29, 16 ; SI-NEXT: v_alignbit_b32 v53, v19, v30, 16 -; SI-NEXT: v_alignbit_b32 v52, v16, v52, 16 -; SI-NEXT: v_alignbit_b32 v25, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v24, v8, v24, 16 -; SI-NEXT: v_alignbit_b32 v23, v3, v23, 16 -; SI-NEXT: v_alignbit_b32 v22, v5, v22, 16 -; SI-NEXT: v_alignbit_b32 v15, v1, v15, 16 +; SI-NEXT: v_alignbit_b32 v52, v15, v52, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v24, 16 +; SI-NEXT: v_alignbit_b32 v23, v1, v23, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v22, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v18, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v51 @@ -36682,85 +36685,81 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v23 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 7ec521c4ed8f7..a2bd1d30cc634 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -4207,21 +4207,27 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 ; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -4230,25 +4236,15 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -4276,22 +4272,26 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -4301,23 +4301,23 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -4384,37 +4384,37 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v8, v8, v35 -; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v43 +; SI-NEXT: v_or_b32_e32 v9, v9, v35 ; SI-NEXT: v_or_b32_e32 v10, v10, v42 ; SI-NEXT: v_or_b32_e32 v11, v11, v34 ; SI-NEXT: v_or_b32_e32 v12, v12, v41 @@ -4429,8 +4429,8 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v21, v58 ; SI-NEXT: v_or_b32_e32 v22, v22, v57 ; SI-NEXT: v_or_b32_e32 v23, v23, v56 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -4449,23 +4449,23 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -4501,24 +4501,23 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 @@ -4540,8 +4539,8 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v8, v43, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 ; SI-NEXT: v_or_b32_e32 v10, v42, v10 ; SI-NEXT: v_or_b32_e32 v11, v34, v11 ; SI-NEXT: v_or_b32_e32 v12, v41, v12 @@ -8770,15 +8769,15 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -8805,7 +8804,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -9733,8 +9732,8 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 @@ -9801,14 +9800,14 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -13996,21 +13995,27 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 ; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -14019,25 +14024,15 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -14065,22 +14060,26 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -14090,23 +14089,23 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -14173,37 +14172,37 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v8, v8, v35 -; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v43 +; SI-NEXT: v_or_b32_e32 v9, v9, v35 ; SI-NEXT: v_or_b32_e32 v10, v10, v42 ; SI-NEXT: v_or_b32_e32 v11, v11, v34 ; SI-NEXT: v_or_b32_e32 v12, v12, v41 @@ -14218,8 +14217,8 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v21, v58 ; SI-NEXT: v_or_b32_e32 v22, v22, v57 ; SI-NEXT: v_or_b32_e32 v23, v23, v56 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -14238,23 +14237,23 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -14290,24 +14289,23 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 @@ -14329,8 +14327,8 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v8, v43, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 ; SI-NEXT: v_or_b32_e32 v10, v42, v10 ; SI-NEXT: v_or_b32_e32 v11, v34, v11 ; SI-NEXT: v_or_b32_e32 v12, v41, v12 @@ -18555,15 +18553,15 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -18590,7 +18588,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -19518,8 +19516,8 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 @@ -19586,14 +19584,14 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -23044,21 +23042,27 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 ; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -23067,25 +23071,15 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -23113,22 +23107,26 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -23138,23 +23136,23 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -23221,37 +23219,37 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v8, v8, v35 -; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v43 +; SI-NEXT: v_or_b32_e32 v9, v9, v35 ; SI-NEXT: v_or_b32_e32 v10, v10, v42 ; SI-NEXT: v_or_b32_e32 v11, v11, v34 ; SI-NEXT: v_or_b32_e32 v12, v12, v41 @@ -23266,8 +23264,8 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v21, v58 ; SI-NEXT: v_or_b32_e32 v22, v22, v57 ; SI-NEXT: v_or_b32_e32 v23, v23, v56 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -23286,23 +23284,23 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -23338,24 +23336,23 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 @@ -23377,8 +23374,8 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v8, v43, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 ; SI-NEXT: v_or_b32_e32 v10, v42, v10 ; SI-NEXT: v_or_b32_e32 v11, v34, v11 ; SI-NEXT: v_or_b32_e32 v12, v41, v12 @@ -27619,15 +27616,15 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -27654,7 +27651,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -28582,8 +28579,8 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 @@ -28650,14 +28647,14 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -31213,21 +31210,27 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v48, v14 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v50, v10 ; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v52, v6 ; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -31236,25 +31239,15 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -31282,22 +31275,26 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -31307,23 +31304,23 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; kill: killed $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -31390,37 +31387,37 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v8, v8, v35 -; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v43 +; SI-NEXT: v_or_b32_e32 v9, v9, v35 ; SI-NEXT: v_or_b32_e32 v10, v10, v42 ; SI-NEXT: v_or_b32_e32 v11, v11, v34 ; SI-NEXT: v_or_b32_e32 v12, v12, v41 @@ -31435,8 +31432,8 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v21, v21, v58 ; SI-NEXT: v_or_b32_e32 v22, v22, v57 ; SI-NEXT: v_or_b32_e32 v23, v23, v56 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -31455,23 +31452,23 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -31507,24 +31504,23 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 @@ -31546,8 +31542,8 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v8, v35, v8 -; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v8, v43, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 ; SI-NEXT: v_or_b32_e32 v10, v42, v10 ; SI-NEXT: v_or_b32_e32 v11, v34, v11 ; SI-NEXT: v_or_b32_e32 v12, v41, v12 @@ -35690,15 +35686,15 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -35725,7 +35721,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -36653,8 +36649,8 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 @@ -36721,14 +36717,14 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 028e61a1ef687..fbee320c82c7f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -4507,73 +4507,69 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -4584,60 +4580,64 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 @@ -4647,11 +4647,11 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v57 -; SI-NEXT: v_or_b32_e32 v3, v3, v39 -; SI-NEXT: v_or_b32_e32 v4, v4, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v47 -; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -4661,27 +4661,25 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 @@ -4718,34 +4716,37 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v36 -; SI-NEXT: v_or_b32_e32 v9, v9, v46 -; SI-NEXT: v_or_b32_e32 v10, v10, v35 -; SI-NEXT: v_or_b32_e32 v11, v11, v45 -; SI-NEXT: v_or_b32_e32 v12, v12, v44 -; SI-NEXT: v_or_b32_e32 v13, v13, v34 -; SI-NEXT: v_or_b32_e32 v14, v14, v43 -; SI-NEXT: v_or_b32_e32 v15, v15, v42 -; SI-NEXT: v_or_b32_e32 v16, v16, v33 -; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v46 +; SI-NEXT: v_or_b32_e32 v9, v9, v45 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v14, v14, v42 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 ; SI-NEXT: v_or_b32_e32 v18, v18, v40 ; SI-NEXT: v_or_b32_e32 v19, v19, v32 ; SI-NEXT: v_or_b32_e32 v20, v20, v63 @@ -4755,16 +4756,16 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v24, v59 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -4778,27 +4779,27 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -4816,11 +4817,11 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v47, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v47, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -4830,30 +4831,33 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 @@ -4875,16 +4879,16 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v36, v8 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v10, v35, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v34, v13 -; SI-NEXT: v_or_b32_e32 v14, v43, v14 -; SI-NEXT: v_or_b32_e32 v15, v42, v15 -; SI-NEXT: v_or_b32_e32 v16, v33, v16 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v35, v13 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 +; SI-NEXT: v_or_b32_e32 v15, v34, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v33, v17 ; SI-NEXT: v_or_b32_e32 v18, v40, v18 ; SI-NEXT: v_or_b32_e32 v19, v32, v19 ; SI-NEXT: v_or_b32_e32 v20, v63, v20 @@ -9474,50 +9478,50 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -9525,50 +9529,50 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -10544,7 +10548,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 @@ -10590,8 +10596,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v18, v33, v18 ; SI-NEXT: v_or_b32_e32 v19, v59, v19 ; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(4) @@ -15201,73 +15205,69 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -15278,60 +15278,64 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 @@ -15341,11 +15345,11 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v57 -; SI-NEXT: v_or_b32_e32 v3, v3, v39 -; SI-NEXT: v_or_b32_e32 v4, v4, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v47 -; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -15355,27 +15359,25 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 @@ -15412,34 +15414,37 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v36 -; SI-NEXT: v_or_b32_e32 v9, v9, v46 -; SI-NEXT: v_or_b32_e32 v10, v10, v35 -; SI-NEXT: v_or_b32_e32 v11, v11, v45 -; SI-NEXT: v_or_b32_e32 v12, v12, v44 -; SI-NEXT: v_or_b32_e32 v13, v13, v34 -; SI-NEXT: v_or_b32_e32 v14, v14, v43 -; SI-NEXT: v_or_b32_e32 v15, v15, v42 -; SI-NEXT: v_or_b32_e32 v16, v16, v33 -; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v46 +; SI-NEXT: v_or_b32_e32 v9, v9, v45 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v14, v14, v42 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 ; SI-NEXT: v_or_b32_e32 v18, v18, v40 ; SI-NEXT: v_or_b32_e32 v19, v19, v32 ; SI-NEXT: v_or_b32_e32 v20, v20, v63 @@ -15449,16 +15454,16 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v24, v59 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -15472,27 +15477,27 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -15510,11 +15515,11 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v47, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v47, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -15524,30 +15529,33 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 @@ -15569,16 +15577,16 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v36, v8 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v10, v35, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v34, v13 -; SI-NEXT: v_or_b32_e32 v14, v43, v14 -; SI-NEXT: v_or_b32_e32 v15, v42, v15 -; SI-NEXT: v_or_b32_e32 v16, v33, v16 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v35, v13 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 +; SI-NEXT: v_or_b32_e32 v15, v34, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v33, v17 ; SI-NEXT: v_or_b32_e32 v18, v40, v18 ; SI-NEXT: v_or_b32_e32 v19, v32, v19 ; SI-NEXT: v_or_b32_e32 v20, v63, v20 @@ -20169,50 +20177,50 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -20220,50 +20228,50 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -21239,7 +21247,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 @@ -21285,8 +21295,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v18, v33, v18 ; SI-NEXT: v_or_b32_e32 v19, v59, v19 ; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(4) @@ -25096,73 +25104,69 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -25173,60 +25177,64 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 @@ -25236,11 +25244,11 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v57 -; SI-NEXT: v_or_b32_e32 v3, v3, v39 -; SI-NEXT: v_or_b32_e32 v4, v4, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v47 -; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -25250,27 +25258,25 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 @@ -25307,34 +25313,37 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v36 -; SI-NEXT: v_or_b32_e32 v9, v9, v46 -; SI-NEXT: v_or_b32_e32 v10, v10, v35 -; SI-NEXT: v_or_b32_e32 v11, v11, v45 -; SI-NEXT: v_or_b32_e32 v12, v12, v44 -; SI-NEXT: v_or_b32_e32 v13, v13, v34 -; SI-NEXT: v_or_b32_e32 v14, v14, v43 -; SI-NEXT: v_or_b32_e32 v15, v15, v42 -; SI-NEXT: v_or_b32_e32 v16, v16, v33 -; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v46 +; SI-NEXT: v_or_b32_e32 v9, v9, v45 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v14, v14, v42 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 ; SI-NEXT: v_or_b32_e32 v18, v18, v40 ; SI-NEXT: v_or_b32_e32 v19, v19, v32 ; SI-NEXT: v_or_b32_e32 v20, v20, v63 @@ -25344,16 +25353,16 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v24, v59 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -25367,27 +25376,27 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -25405,11 +25414,11 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v47, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v47, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -25419,30 +25428,33 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 @@ -25464,16 +25476,16 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v36, v8 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v10, v35, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v34, v13 -; SI-NEXT: v_or_b32_e32 v14, v43, v14 -; SI-NEXT: v_or_b32_e32 v15, v42, v15 -; SI-NEXT: v_or_b32_e32 v16, v33, v16 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v35, v13 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 +; SI-NEXT: v_or_b32_e32 v15, v34, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v33, v17 ; SI-NEXT: v_or_b32_e32 v18, v40, v18 ; SI-NEXT: v_or_b32_e32 v19, v32, v19 ; SI-NEXT: v_or_b32_e32 v20, v63, v20 @@ -30078,50 +30090,50 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -30129,50 +30141,50 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -31148,7 +31160,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 @@ -31194,8 +31208,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v18, v33, v18 ; SI-NEXT: v_or_b32_e32 v19, v59, v19 ; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(4) @@ -34085,73 +34097,69 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -34162,60 +34170,64 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 @@ -34225,11 +34237,11 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v57 -; SI-NEXT: v_or_b32_e32 v3, v3, v39 -; SI-NEXT: v_or_b32_e32 v4, v4, v56 -; SI-NEXT: v_or_b32_e32 v5, v5, v47 -; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -34239,27 +34251,25 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 @@ -34296,34 +34306,37 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v36 -; SI-NEXT: v_or_b32_e32 v9, v9, v46 -; SI-NEXT: v_or_b32_e32 v10, v10, v35 -; SI-NEXT: v_or_b32_e32 v11, v11, v45 -; SI-NEXT: v_or_b32_e32 v12, v12, v44 -; SI-NEXT: v_or_b32_e32 v13, v13, v34 -; SI-NEXT: v_or_b32_e32 v14, v14, v43 -; SI-NEXT: v_or_b32_e32 v15, v15, v42 -; SI-NEXT: v_or_b32_e32 v16, v16, v33 -; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v46 +; SI-NEXT: v_or_b32_e32 v9, v9, v45 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v35 +; SI-NEXT: v_or_b32_e32 v14, v14, v42 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 ; SI-NEXT: v_or_b32_e32 v18, v18, v40 ; SI-NEXT: v_or_b32_e32 v19, v19, v32 ; SI-NEXT: v_or_b32_e32 v20, v20, v63 @@ -34333,16 +34346,16 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v24, v24, v59 ; SI-NEXT: ; kill: killed $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -34356,27 +34369,27 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -34394,11 +34407,11 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v2, v57, v2 -; SI-NEXT: v_or_b32_e32 v3, v39, v3 -; SI-NEXT: v_or_b32_e32 v4, v56, v4 -; SI-NEXT: v_or_b32_e32 v5, v47, v5 -; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v47, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -34408,30 +34421,33 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 @@ -34453,16 +34469,16 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v36, v8 -; SI-NEXT: v_or_b32_e32 v9, v46, v9 -; SI-NEXT: v_or_b32_e32 v10, v35, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v44, v12 -; SI-NEXT: v_or_b32_e32 v13, v34, v13 -; SI-NEXT: v_or_b32_e32 v14, v43, v14 -; SI-NEXT: v_or_b32_e32 v15, v42, v15 -; SI-NEXT: v_or_b32_e32 v16, v33, v16 -; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v35, v13 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 +; SI-NEXT: v_or_b32_e32 v15, v34, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v33, v17 ; SI-NEXT: v_or_b32_e32 v18, v40, v18 ; SI-NEXT: v_or_b32_e32 v19, v32, v19 ; SI-NEXT: v_or_b32_e32 v20, v63, v20 @@ -38962,50 +38978,50 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -39013,50 +39029,50 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -40032,7 +40048,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 @@ -40078,8 +40096,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v18, v33, v18 ; SI-NEXT: v_or_b32_e32 v19, v59, v19 ; SI-NEXT: v_or_b32_e32 v20, v27, v20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(4) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index f3ef202a22f31..462e50ac8412c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -4831,152 +4831,152 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -5026,9 +5026,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v58 -; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 ; SI-NEXT: v_or_b32_e32 v16, v16, v49 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -5037,125 +5037,125 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v57 -; SI-NEXT: v_or_b32_e32 v4, v4, v48 -; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v45 -; SI-NEXT: v_or_b32_e32 v9, v9, v44 -; SI-NEXT: v_or_b32_e32 v10, v10, v34 -; SI-NEXT: v_or_b32_e32 v11, v11, v33 -; SI-NEXT: v_or_b32_e32 v12, v12, v32 -; SI-NEXT: v_or_b32_e32 v13, v13, v63 -; SI-NEXT: v_or_b32_e32 v14, v14, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v56 -; SI-NEXT: v_or_b32_e32 v18, v18, v47 -; SI-NEXT: v_or_b32_e32 v19, v19, v38 -; SI-NEXT: v_or_b32_e32 v20, v20, v36 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v43 -; SI-NEXT: v_or_b32_e32 v23, v23, v42 -; SI-NEXT: v_or_b32_e32 v24, v24, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v40 -; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v7, v7, v45 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v42 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v47 +; SI-NEXT: v_or_b32_e32 v17, v17, v48 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: v_or_b32_e32 v20, v20, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v33 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v63 +; SI-NEXT: v_or_b32_e32 v25, v25, v62 +; SI-NEXT: v_or_b32_e32 v26, v26, v61 ; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -5164,10 +5164,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 ; SI-NEXT: v_or_b32_e32 v16, v49, v16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -5177,35 +5177,37 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -5231,29 +5233,29 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_or_b32_e32 v4, v48, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 ; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v34, v10 -; SI-NEXT: v_or_b32_e32 v11, v33, v11 -; SI-NEXT: v_or_b32_e32 v12, v32, v12 -; SI-NEXT: v_or_b32_e32 v13, v63, v13 -; SI-NEXT: v_or_b32_e32 v14, v61, v14 -; SI-NEXT: v_or_b32_e32 v15, v50, v15 -; SI-NEXT: v_or_b32_e32 v17, v56, v17 -; SI-NEXT: v_or_b32_e32 v18, v47, v18 -; SI-NEXT: v_or_b32_e32 v19, v38, v19 -; SI-NEXT: v_or_b32_e32 v20, v36, v20 -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v42, v23 -; SI-NEXT: v_or_b32_e32 v24, v41, v24 -; SI-NEXT: v_or_b32_e32 v25, v40, v25 -; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v47, v15 +; SI-NEXT: v_or_b32_e32 v17, v48, v17 +; SI-NEXT: v_or_b32_e32 v18, v38, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_or_b32_e32 v20, v35, v20 +; SI-NEXT: v_or_b32_e32 v21, v33, v21 +; SI-NEXT: v_or_b32_e32 v22, v32, v22 +; SI-NEXT: v_or_b32_e32 v23, v41, v23 +; SI-NEXT: v_or_b32_e32 v24, v63, v24 +; SI-NEXT: v_or_b32_e32 v25, v62, v25 +; SI-NEXT: v_or_b32_e32 v26, v61, v26 ; SI-NEXT: v_or_b32_e32 v27, v60, v27 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 @@ -11430,12 +11432,12 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v49, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -11485,7 +11487,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -11584,21 +11586,21 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -11611,37 +11613,37 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -16445,152 +16447,152 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -16640,9 +16642,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v58 -; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 ; SI-NEXT: v_or_b32_e32 v16, v16, v49 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -16651,125 +16653,125 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v57 -; SI-NEXT: v_or_b32_e32 v4, v4, v48 -; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v45 -; SI-NEXT: v_or_b32_e32 v9, v9, v44 -; SI-NEXT: v_or_b32_e32 v10, v10, v34 -; SI-NEXT: v_or_b32_e32 v11, v11, v33 -; SI-NEXT: v_or_b32_e32 v12, v12, v32 -; SI-NEXT: v_or_b32_e32 v13, v13, v63 -; SI-NEXT: v_or_b32_e32 v14, v14, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v56 -; SI-NEXT: v_or_b32_e32 v18, v18, v47 -; SI-NEXT: v_or_b32_e32 v19, v19, v38 -; SI-NEXT: v_or_b32_e32 v20, v20, v36 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v43 -; SI-NEXT: v_or_b32_e32 v23, v23, v42 -; SI-NEXT: v_or_b32_e32 v24, v24, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v40 -; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v7, v7, v45 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v42 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v47 +; SI-NEXT: v_or_b32_e32 v17, v17, v48 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: v_or_b32_e32 v20, v20, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v33 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v63 +; SI-NEXT: v_or_b32_e32 v25, v25, v62 +; SI-NEXT: v_or_b32_e32 v26, v26, v61 ; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB30_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -16778,10 +16780,10 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 ; SI-NEXT: v_or_b32_e32 v16, v49, v16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -16791,35 +16793,37 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -16845,29 +16849,29 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_or_b32_e32 v4, v48, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 ; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v34, v10 -; SI-NEXT: v_or_b32_e32 v11, v33, v11 -; SI-NEXT: v_or_b32_e32 v12, v32, v12 -; SI-NEXT: v_or_b32_e32 v13, v63, v13 -; SI-NEXT: v_or_b32_e32 v14, v61, v14 -; SI-NEXT: v_or_b32_e32 v15, v50, v15 -; SI-NEXT: v_or_b32_e32 v17, v56, v17 -; SI-NEXT: v_or_b32_e32 v18, v47, v18 -; SI-NEXT: v_or_b32_e32 v19, v38, v19 -; SI-NEXT: v_or_b32_e32 v20, v36, v20 -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v42, v23 -; SI-NEXT: v_or_b32_e32 v24, v41, v24 -; SI-NEXT: v_or_b32_e32 v25, v40, v25 -; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v47, v15 +; SI-NEXT: v_or_b32_e32 v17, v48, v17 +; SI-NEXT: v_or_b32_e32 v18, v38, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_or_b32_e32 v20, v35, v20 +; SI-NEXT: v_or_b32_e32 v21, v33, v21 +; SI-NEXT: v_or_b32_e32 v22, v32, v22 +; SI-NEXT: v_or_b32_e32 v23, v41, v23 +; SI-NEXT: v_or_b32_e32 v24, v63, v24 +; SI-NEXT: v_or_b32_e32 v25, v62, v25 +; SI-NEXT: v_or_b32_e32 v26, v61, v26 ; SI-NEXT: v_or_b32_e32 v27, v60, v27 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 @@ -23034,12 +23038,12 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v49, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -23089,7 +23093,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -23188,21 +23192,21 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -23215,37 +23219,37 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -27200,152 +27204,152 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -27395,9 +27399,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v58 -; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 ; SI-NEXT: v_or_b32_e32 v16, v16, v49 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -27406,125 +27410,125 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v57 -; SI-NEXT: v_or_b32_e32 v4, v4, v48 -; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v45 -; SI-NEXT: v_or_b32_e32 v9, v9, v44 -; SI-NEXT: v_or_b32_e32 v10, v10, v34 -; SI-NEXT: v_or_b32_e32 v11, v11, v33 -; SI-NEXT: v_or_b32_e32 v12, v12, v32 -; SI-NEXT: v_or_b32_e32 v13, v13, v63 -; SI-NEXT: v_or_b32_e32 v14, v14, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v56 -; SI-NEXT: v_or_b32_e32 v18, v18, v47 -; SI-NEXT: v_or_b32_e32 v19, v19, v38 -; SI-NEXT: v_or_b32_e32 v20, v20, v36 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v43 -; SI-NEXT: v_or_b32_e32 v23, v23, v42 -; SI-NEXT: v_or_b32_e32 v24, v24, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v40 -; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v7, v7, v45 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v42 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v47 +; SI-NEXT: v_or_b32_e32 v17, v17, v48 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: v_or_b32_e32 v20, v20, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v33 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v63 +; SI-NEXT: v_or_b32_e32 v25, v25, v62 +; SI-NEXT: v_or_b32_e32 v26, v26, v61 ; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -27533,10 +27537,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 ; SI-NEXT: v_or_b32_e32 v16, v49, v16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -27546,35 +27550,37 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -27600,29 +27606,29 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_or_b32_e32 v4, v48, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 ; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v34, v10 -; SI-NEXT: v_or_b32_e32 v11, v33, v11 -; SI-NEXT: v_or_b32_e32 v12, v32, v12 -; SI-NEXT: v_or_b32_e32 v13, v63, v13 -; SI-NEXT: v_or_b32_e32 v14, v61, v14 -; SI-NEXT: v_or_b32_e32 v15, v50, v15 -; SI-NEXT: v_or_b32_e32 v17, v56, v17 -; SI-NEXT: v_or_b32_e32 v18, v47, v18 -; SI-NEXT: v_or_b32_e32 v19, v38, v19 -; SI-NEXT: v_or_b32_e32 v20, v36, v20 -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v42, v23 -; SI-NEXT: v_or_b32_e32 v24, v41, v24 -; SI-NEXT: v_or_b32_e32 v25, v40, v25 -; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v47, v15 +; SI-NEXT: v_or_b32_e32 v17, v48, v17 +; SI-NEXT: v_or_b32_e32 v18, v38, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_or_b32_e32 v20, v35, v20 +; SI-NEXT: v_or_b32_e32 v21, v33, v21 +; SI-NEXT: v_or_b32_e32 v22, v32, v22 +; SI-NEXT: v_or_b32_e32 v23, v41, v23 +; SI-NEXT: v_or_b32_e32 v24, v63, v24 +; SI-NEXT: v_or_b32_e32 v25, v62, v25 +; SI-NEXT: v_or_b32_e32 v26, v61, v26 ; SI-NEXT: v_or_b32_e32 v27, v60, v27 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 @@ -33813,12 +33819,12 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v49, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -33868,7 +33874,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -33967,21 +33973,21 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -33994,37 +34000,37 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -37005,152 +37011,152 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v2 ; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 @@ -37200,9 +37206,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v58 -; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 ; SI-NEXT: v_or_b32_e32 v16, v16, v49 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -37211,125 +37217,125 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v3, v3, v57 -; SI-NEXT: v_or_b32_e32 v4, v4, v48 -; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v46 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_or_b32_e32 v8, v8, v45 -; SI-NEXT: v_or_b32_e32 v9, v9, v44 -; SI-NEXT: v_or_b32_e32 v10, v10, v34 -; SI-NEXT: v_or_b32_e32 v11, v11, v33 -; SI-NEXT: v_or_b32_e32 v12, v12, v32 -; SI-NEXT: v_or_b32_e32 v13, v13, v63 -; SI-NEXT: v_or_b32_e32 v14, v14, v61 -; SI-NEXT: v_or_b32_e32 v15, v15, v50 -; SI-NEXT: v_or_b32_e32 v17, v17, v56 -; SI-NEXT: v_or_b32_e32 v18, v18, v47 -; SI-NEXT: v_or_b32_e32 v19, v19, v38 -; SI-NEXT: v_or_b32_e32 v20, v20, v36 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v22, v22, v43 -; SI-NEXT: v_or_b32_e32 v23, v23, v42 -; SI-NEXT: v_or_b32_e32 v24, v24, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v40 -; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v7, v7, v45 +; SI-NEXT: v_or_b32_e32 v8, v8, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v43 +; SI-NEXT: v_or_b32_e32 v13, v13, v42 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v47 +; SI-NEXT: v_or_b32_e32 v17, v17, v48 +; SI-NEXT: v_or_b32_e32 v18, v18, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: v_or_b32_e32 v20, v20, v35 +; SI-NEXT: v_or_b32_e32 v21, v21, v33 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_or_b32_e32 v23, v23, v41 +; SI-NEXT: v_or_b32_e32 v24, v24, v63 +; SI-NEXT: v_or_b32_e32 v25, v25, v62 +; SI-NEXT: v_or_b32_e32 v26, v26, v61 ; SI-NEXT: v_or_b32_e32 v27, v27, v60 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 @@ -37338,10 +37344,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 ; SI-NEXT: v_or_b32_e32 v16, v49, v16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -37351,35 +37357,37 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -37405,29 +37413,29 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v3, v57, v3 -; SI-NEXT: v_or_b32_e32 v4, v48, v4 -; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 ; SI-NEXT: v_or_b32_e32 v6, v46, v6 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: v_or_b32_e32 v8, v45, v8 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_or_b32_e32 v10, v34, v10 -; SI-NEXT: v_or_b32_e32 v11, v33, v11 -; SI-NEXT: v_or_b32_e32 v12, v32, v12 -; SI-NEXT: v_or_b32_e32 v13, v63, v13 -; SI-NEXT: v_or_b32_e32 v14, v61, v14 -; SI-NEXT: v_or_b32_e32 v15, v50, v15 -; SI-NEXT: v_or_b32_e32 v17, v56, v17 -; SI-NEXT: v_or_b32_e32 v18, v47, v18 -; SI-NEXT: v_or_b32_e32 v19, v38, v19 -; SI-NEXT: v_or_b32_e32 v20, v36, v20 -; SI-NEXT: v_or_b32_e32 v21, v35, v21 -; SI-NEXT: v_or_b32_e32 v22, v43, v22 -; SI-NEXT: v_or_b32_e32 v23, v42, v23 -; SI-NEXT: v_or_b32_e32 v24, v41, v24 -; SI-NEXT: v_or_b32_e32 v25, v40, v25 -; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v36, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v43, v12 +; SI-NEXT: v_or_b32_e32 v13, v42, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v47, v15 +; SI-NEXT: v_or_b32_e32 v17, v48, v17 +; SI-NEXT: v_or_b32_e32 v18, v38, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_or_b32_e32 v20, v35, v20 +; SI-NEXT: v_or_b32_e32 v21, v33, v21 +; SI-NEXT: v_or_b32_e32 v22, v32, v22 +; SI-NEXT: v_or_b32_e32 v23, v41, v23 +; SI-NEXT: v_or_b32_e32 v24, v63, v24 +; SI-NEXT: v_or_b32_e32 v25, v62, v25 +; SI-NEXT: v_or_b32_e32 v26, v61, v26 ; SI-NEXT: v_or_b32_e32 v27, v60, v27 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 @@ -43505,12 +43513,12 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v49, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -43560,7 +43568,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -43659,21 +43667,21 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -43686,37 +43694,37 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 0a7790a27f5ae..94ed6276bd051 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -5148,167 +5148,167 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -5365,111 +5365,111 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v18, v18, v37 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v60 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_or_b32_e32 v3, v3, v52 -; SI-NEXT: v_or_b32_e32 v4, v4, v51 -; SI-NEXT: v_or_b32_e32 v5, v5, v59 -; SI-NEXT: v_or_b32_e32 v6, v6, v50 -; SI-NEXT: v_or_b32_e32 v7, v7, v49 -; SI-NEXT: v_or_b32_e32 v8, v8, v48 -; SI-NEXT: v_or_b32_e32 v9, v9, v58 -; SI-NEXT: v_or_b32_e32 v10, v10, v57 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v56 -; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v50 +; SI-NEXT: v_or_b32_e32 v10, v10, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v39 ; SI-NEXT: v_or_b32_e32 v14, v14, v46 -; SI-NEXT: v_or_b32_e32 v15, v15, v38 -; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v45 +; SI-NEXT: v_or_b32_e32 v16, v16, v38 ; SI-NEXT: v_or_b32_e32 v17, v17, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v42 -; SI-NEXT: v_or_b32_e32 v20, v20, v41 -; SI-NEXT: v_or_b32_e32 v21, v21, v40 -; SI-NEXT: v_or_b32_e32 v22, v22, v37 -; SI-NEXT: v_or_b32_e32 v23, v23, v36 -; SI-NEXT: v_or_b32_e32 v24, v24, v35 -; SI-NEXT: v_or_b32_e32 v25, v25, v34 -; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v36 +; SI-NEXT: v_or_b32_e32 v20, v20, v43 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v42 +; SI-NEXT: v_or_b32_e32 v23, v23, v34 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v33 +; SI-NEXT: v_or_b32_e32 v26, v26, v40 ; SI-NEXT: v_or_b32_e32 v27, v27, v32 ; SI-NEXT: v_or_b32_e32 v28, v28, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -5477,79 +5477,75 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_or_b32_e32 v18, v37, v18 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 @@ -5583,31 +5579,31 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_or_b32_e32 v9, v58, v9 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 -; SI-NEXT: v_or_b32_e32 v12, v56, v12 -; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v48, v11 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v39, v13 ; SI-NEXT: v_or_b32_e32 v14, v46, v14 -; SI-NEXT: v_or_b32_e32 v15, v38, v15 -; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 +; SI-NEXT: v_or_b32_e32 v16, v38, v16 ; SI-NEXT: v_or_b32_e32 v17, v44, v17 -; SI-NEXT: v_or_b32_e32 v19, v42, v19 -; SI-NEXT: v_or_b32_e32 v20, v41, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v37, v22 -; SI-NEXT: v_or_b32_e32 v23, v36, v23 -; SI-NEXT: v_or_b32_e32 v24, v35, v24 -; SI-NEXT: v_or_b32_e32 v25, v34, v25 -; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v19, v36, v19 +; SI-NEXT: v_or_b32_e32 v20, v43, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v34, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v33, v25 +; SI-NEXT: v_or_b32_e32 v26, v40, v26 ; SI-NEXT: v_or_b32_e32 v27, v32, v27 ; SI-NEXT: v_or_b32_e32 v28, v63, v28 ; SI-NEXT: v_or_b32_e32 v29, v62, v29 @@ -10599,17 +10595,17 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 @@ -10714,35 +10710,35 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 @@ -12312,21 +12308,26 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_or_b32_e32 v13, v43, v13 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_mov_b32_e32 v57, v39 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -12366,11 +12367,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v22, v30, v22 ; SI-NEXT: v_or_b32_e32 v23, v31, v23 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(10) @@ -12503,9 +12499,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -12514,8 +12510,8 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -12530,38 +12526,38 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -17703,167 +17699,167 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -17920,111 +17916,111 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v18, v18, v37 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v60 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_or_b32_e32 v3, v3, v52 -; SI-NEXT: v_or_b32_e32 v4, v4, v51 -; SI-NEXT: v_or_b32_e32 v5, v5, v59 -; SI-NEXT: v_or_b32_e32 v6, v6, v50 -; SI-NEXT: v_or_b32_e32 v7, v7, v49 -; SI-NEXT: v_or_b32_e32 v8, v8, v48 -; SI-NEXT: v_or_b32_e32 v9, v9, v58 -; SI-NEXT: v_or_b32_e32 v10, v10, v57 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v56 -; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v50 +; SI-NEXT: v_or_b32_e32 v10, v10, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v39 ; SI-NEXT: v_or_b32_e32 v14, v14, v46 -; SI-NEXT: v_or_b32_e32 v15, v15, v38 -; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v45 +; SI-NEXT: v_or_b32_e32 v16, v16, v38 ; SI-NEXT: v_or_b32_e32 v17, v17, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v42 -; SI-NEXT: v_or_b32_e32 v20, v20, v41 -; SI-NEXT: v_or_b32_e32 v21, v21, v40 -; SI-NEXT: v_or_b32_e32 v22, v22, v37 -; SI-NEXT: v_or_b32_e32 v23, v23, v36 -; SI-NEXT: v_or_b32_e32 v24, v24, v35 -; SI-NEXT: v_or_b32_e32 v25, v25, v34 -; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v36 +; SI-NEXT: v_or_b32_e32 v20, v20, v43 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v42 +; SI-NEXT: v_or_b32_e32 v23, v23, v34 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v33 +; SI-NEXT: v_or_b32_e32 v26, v26, v40 ; SI-NEXT: v_or_b32_e32 v27, v27, v32 ; SI-NEXT: v_or_b32_e32 v28, v28, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -18032,79 +18028,75 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB30_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_or_b32_e32 v18, v37, v18 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 @@ -18138,31 +18130,31 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_or_b32_e32 v9, v58, v9 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 -; SI-NEXT: v_or_b32_e32 v12, v56, v12 -; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v48, v11 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v39, v13 ; SI-NEXT: v_or_b32_e32 v14, v46, v14 -; SI-NEXT: v_or_b32_e32 v15, v38, v15 -; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 +; SI-NEXT: v_or_b32_e32 v16, v38, v16 ; SI-NEXT: v_or_b32_e32 v17, v44, v17 -; SI-NEXT: v_or_b32_e32 v19, v42, v19 -; SI-NEXT: v_or_b32_e32 v20, v41, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v37, v22 -; SI-NEXT: v_or_b32_e32 v23, v36, v23 -; SI-NEXT: v_or_b32_e32 v24, v35, v24 -; SI-NEXT: v_or_b32_e32 v25, v34, v25 -; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v19, v36, v19 +; SI-NEXT: v_or_b32_e32 v20, v43, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v34, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v33, v25 +; SI-NEXT: v_or_b32_e32 v26, v40, v26 ; SI-NEXT: v_or_b32_e32 v27, v32, v27 ; SI-NEXT: v_or_b32_e32 v28, v63, v28 ; SI-NEXT: v_or_b32_e32 v29, v62, v29 @@ -23136,17 +23128,17 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 @@ -23251,35 +23243,35 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 @@ -24849,21 +24841,26 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_or_b32_e32 v13, v43, v13 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_mov_b32_e32 v57, v39 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -24903,11 +24900,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v22, v30, v22 ; SI-NEXT: v_or_b32_e32 v23, v31, v23 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(10) @@ -25040,9 +25032,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -25051,8 +25043,8 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -25067,38 +25059,38 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -29360,167 +29352,167 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -29577,111 +29569,111 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v18, v18, v37 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v60 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_or_b32_e32 v3, v3, v52 -; SI-NEXT: v_or_b32_e32 v4, v4, v51 -; SI-NEXT: v_or_b32_e32 v5, v5, v59 -; SI-NEXT: v_or_b32_e32 v6, v6, v50 -; SI-NEXT: v_or_b32_e32 v7, v7, v49 -; SI-NEXT: v_or_b32_e32 v8, v8, v48 -; SI-NEXT: v_or_b32_e32 v9, v9, v58 -; SI-NEXT: v_or_b32_e32 v10, v10, v57 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v56 -; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v50 +; SI-NEXT: v_or_b32_e32 v10, v10, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v39 ; SI-NEXT: v_or_b32_e32 v14, v14, v46 -; SI-NEXT: v_or_b32_e32 v15, v15, v38 -; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v45 +; SI-NEXT: v_or_b32_e32 v16, v16, v38 ; SI-NEXT: v_or_b32_e32 v17, v17, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v42 -; SI-NEXT: v_or_b32_e32 v20, v20, v41 -; SI-NEXT: v_or_b32_e32 v21, v21, v40 -; SI-NEXT: v_or_b32_e32 v22, v22, v37 -; SI-NEXT: v_or_b32_e32 v23, v23, v36 -; SI-NEXT: v_or_b32_e32 v24, v24, v35 -; SI-NEXT: v_or_b32_e32 v25, v25, v34 -; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v36 +; SI-NEXT: v_or_b32_e32 v20, v20, v43 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v42 +; SI-NEXT: v_or_b32_e32 v23, v23, v34 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v33 +; SI-NEXT: v_or_b32_e32 v26, v26, v40 ; SI-NEXT: v_or_b32_e32 v27, v27, v32 ; SI-NEXT: v_or_b32_e32 v28, v28, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -29689,79 +29681,75 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_or_b32_e32 v18, v37, v18 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 @@ -29795,31 +29783,31 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_or_b32_e32 v9, v58, v9 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 -; SI-NEXT: v_or_b32_e32 v12, v56, v12 -; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v48, v11 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v39, v13 ; SI-NEXT: v_or_b32_e32 v14, v46, v14 -; SI-NEXT: v_or_b32_e32 v15, v38, v15 -; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 +; SI-NEXT: v_or_b32_e32 v16, v38, v16 ; SI-NEXT: v_or_b32_e32 v17, v44, v17 -; SI-NEXT: v_or_b32_e32 v19, v42, v19 -; SI-NEXT: v_or_b32_e32 v20, v41, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v37, v22 -; SI-NEXT: v_or_b32_e32 v23, v36, v23 -; SI-NEXT: v_or_b32_e32 v24, v35, v24 -; SI-NEXT: v_or_b32_e32 v25, v34, v25 -; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v19, v36, v19 +; SI-NEXT: v_or_b32_e32 v20, v43, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v34, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v33, v25 +; SI-NEXT: v_or_b32_e32 v26, v40, v26 ; SI-NEXT: v_or_b32_e32 v27, v32, v27 ; SI-NEXT: v_or_b32_e32 v28, v63, v28 ; SI-NEXT: v_or_b32_e32 v29, v62, v29 @@ -34828,17 +34816,17 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 @@ -34943,35 +34931,35 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 @@ -36541,21 +36529,26 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_or_b32_e32 v13, v43, v13 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_mov_b32_e32 v57, v39 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -36595,11 +36588,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v22, v30, v22 ; SI-NEXT: v_or_b32_e32 v23, v31, v23 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(10) @@ -36732,9 +36720,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -36743,8 +36731,8 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -36759,38 +36747,38 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -40020,167 +40008,167 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v55, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -40237,111 +40225,111 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v18, v18, v37 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v60 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_or_b32_e32 v3, v3, v52 -; SI-NEXT: v_or_b32_e32 v4, v4, v51 -; SI-NEXT: v_or_b32_e32 v5, v5, v59 -; SI-NEXT: v_or_b32_e32 v6, v6, v50 -; SI-NEXT: v_or_b32_e32 v7, v7, v49 -; SI-NEXT: v_or_b32_e32 v8, v8, v48 -; SI-NEXT: v_or_b32_e32 v9, v9, v58 -; SI-NEXT: v_or_b32_e32 v10, v10, v57 -; SI-NEXT: v_or_b32_e32 v11, v11, v39 -; SI-NEXT: v_or_b32_e32 v12, v12, v56 -; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 +; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v50 +; SI-NEXT: v_or_b32_e32 v10, v10, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v39 ; SI-NEXT: v_or_b32_e32 v14, v14, v46 -; SI-NEXT: v_or_b32_e32 v15, v15, v38 -; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v45 +; SI-NEXT: v_or_b32_e32 v16, v16, v38 ; SI-NEXT: v_or_b32_e32 v17, v17, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v42 -; SI-NEXT: v_or_b32_e32 v20, v20, v41 -; SI-NEXT: v_or_b32_e32 v21, v21, v40 -; SI-NEXT: v_or_b32_e32 v22, v22, v37 -; SI-NEXT: v_or_b32_e32 v23, v23, v36 -; SI-NEXT: v_or_b32_e32 v24, v24, v35 -; SI-NEXT: v_or_b32_e32 v25, v25, v34 -; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v36 +; SI-NEXT: v_or_b32_e32 v20, v20, v43 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v42 +; SI-NEXT: v_or_b32_e32 v23, v23, v34 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v33 +; SI-NEXT: v_or_b32_e32 v26, v26, v40 ; SI-NEXT: v_or_b32_e32 v27, v27, v32 ; SI-NEXT: v_or_b32_e32 v28, v28, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -40349,79 +40337,75 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_or_b32_e32 v18, v37, v18 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 @@ -40455,31 +40439,31 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 -; SI-NEXT: v_or_b32_e32 v3, v52, v3 -; SI-NEXT: v_or_b32_e32 v4, v51, v4 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v6, v50, v6 -; SI-NEXT: v_or_b32_e32 v7, v49, v7 -; SI-NEXT: v_or_b32_e32 v8, v48, v8 -; SI-NEXT: v_or_b32_e32 v9, v58, v9 -; SI-NEXT: v_or_b32_e32 v10, v57, v10 -; SI-NEXT: v_or_b32_e32 v11, v39, v11 -; SI-NEXT: v_or_b32_e32 v12, v56, v12 -; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v50, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v48, v11 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v39, v13 ; SI-NEXT: v_or_b32_e32 v14, v46, v14 -; SI-NEXT: v_or_b32_e32 v15, v38, v15 -; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 +; SI-NEXT: v_or_b32_e32 v16, v38, v16 ; SI-NEXT: v_or_b32_e32 v17, v44, v17 -; SI-NEXT: v_or_b32_e32 v19, v42, v19 -; SI-NEXT: v_or_b32_e32 v20, v41, v20 -; SI-NEXT: v_or_b32_e32 v21, v40, v21 -; SI-NEXT: v_or_b32_e32 v22, v37, v22 -; SI-NEXT: v_or_b32_e32 v23, v36, v23 -; SI-NEXT: v_or_b32_e32 v24, v35, v24 -; SI-NEXT: v_or_b32_e32 v25, v34, v25 -; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v19, v36, v19 +; SI-NEXT: v_or_b32_e32 v20, v43, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v34, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v33, v25 +; SI-NEXT: v_or_b32_e32 v26, v40, v26 ; SI-NEXT: v_or_b32_e32 v27, v32, v27 ; SI-NEXT: v_or_b32_e32 v28, v63, v28 ; SI-NEXT: v_or_b32_e32 v29, v62, v29 @@ -45357,17 +45341,17 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 @@ -45472,35 +45456,35 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 @@ -47070,21 +47054,26 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_or_b32_e32 v13, v43, v13 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_mov_b32_e32 v57, v39 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -47124,11 +47113,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v22, v30, v22 ; SI-NEXT: v_or_b32_e32 v23, v31, v23 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(10) @@ -47261,9 +47245,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -47272,8 +47256,8 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -47288,38 +47272,38 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51934,502 +51918,496 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:104 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v18 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v30 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v41 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v30, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v59 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v47 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v57 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v58 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v59 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v56, v29 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v57, v25 -; SI-NEXT: v_mov_b32_e32 v58, v21 -; SI-NEXT: v_mov_b32_e32 v59, v17 -; SI-NEXT: v_mov_b32_e32 v60, v6 -; SI-NEXT: v_mov_b32_e32 v63, v8 +; SI-NEXT: v_mov_b32_e32 v47, v21 +; SI-NEXT: v_mov_b32_e32 v56, v17 +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v58, v7 +; SI-NEXT: v_mov_b32_e32 v59, v33 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v34 -; SI-NEXT: v_or_b32_e32 v32, v32, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v62 -; SI-NEXT: v_or_b32_e32 v35, v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62 +; SI-NEXT: v_or_b32_e32 v63, v6, v34 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v38 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v46, v37, v38 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v54, v8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v11, v11, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v12, v12, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_or_b32_e32 v15, v15, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v22, v22, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v26, v26, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v18 -; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_or_b32_e32 v14, v14, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v52, v37, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v55 +; SI-NEXT: v_or_b32_e32 v37, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_or_b32_e32 v55, v37, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v48, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v52, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v55, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v43 -; SI-NEXT: v_or_b32_e32 v6, v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v6, v35, v34 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v61 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v61, v49, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v24 +; SI-NEXT: v_or_b32_e32 v29, v29, v28 +; SI-NEXT: v_or_b32_e32 v54, v54, v51 ; SI-NEXT: v_or_b32_e32 v50, v50, v30 -; SI-NEXT: v_or_b32_e32 v33, v33, v41 -; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16 -; SI-NEXT: v_alignbit_b32 v43, v15, v48, 16 -; SI-NEXT: v_alignbit_b32 v30, v54, v30, 16 -; SI-NEXT: v_alignbit_b32 v41, v32, v41, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v6, v37, v3 +; SI-NEXT: v_or_b32_e32 v39, v39, v41 +; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16 +; SI-NEXT: v_alignbit_b32 v24, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v28, v22, v28, 16 +; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16 +; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16 +; SI-NEXT: v_alignbit_b32 v41, v3, v41, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v6, v35, v1 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_alignbit_b32 v3, v52, v3, 16 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_or_b32_e32 v63, v37, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_alignbit_b32 v10, v4, v10, 16 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_or_b32_e32 v60, v38, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_or_b32_e32 v58, v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_alignbit_b32 v8, v48, v8, 16 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_or_b32_e32 v57, v46, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v47 +; SI-NEXT: v_alignbit_b32 v14, v37, v14, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v59, v37, v17 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v56, v35, v17 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v59 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_or_b32_e32 v59, v46, v43 +; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v58, v37, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_alignbit_b32 v44, v19, v37, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v57, v38, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v56, v38, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v42 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v51 -; SI-NEXT: v_or_b32_e32 v39, v39, v42 -; SI-NEXT: v_alignbit_b32 v51, v11, v38, 16 -; SI-NEXT: v_alignbit_b32 v42, v35, v42, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v29, v29, v28 +; SI-NEXT: v_or_b32_e32 v47, v35, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v61 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v61, v44, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v45 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44 +; SI-NEXT: v_or_b32_e32 v36, v36, v45 +; SI-NEXT: v_alignbit_b32 v44, v18, v35, 16 +; SI-NEXT: v_alignbit_b32 v45, v31, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v6, v49, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v6, v33, v42 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v17, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v21, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v26, v25, 16 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v36, v36, v49 +; SI-NEXT: v_alignbit_b32 v6, v2, v17, 16 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v28, 16 -; SI-NEXT: v_alignbit_b32 v45, v46, v49, 16 +; SI-NEXT: v_alignbit_b32 v6, v11, v21, 16 +; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v1, v37, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v34, v34, v35 +; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_or_b32_e32 v34, v34, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52439,11 +52417,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52454,19 +52430,19 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 @@ -52478,16 +52454,14 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52498,8 +52472,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52510,32 +52484,34 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 516a14edbc260..2bdf994496421 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -5633,12 +5633,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 @@ -5664,10 +5663,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(25) ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8479,30 +8479,30 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX9-LABEL: global_extload_v32bf16_to_v32f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:62 -; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:60 -; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:58 -; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:56 -; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:54 -; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:52 -; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:50 -; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:48 -; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:46 -; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:44 -; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:42 -; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:40 -; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:38 -; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:36 -; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:34 -; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:32 -; GFX9-NEXT: global_load_ushort v25, v[1:2], off -; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:2 -; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:30 +; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:62 +; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:60 +; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:58 +; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:56 +; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:54 +; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:52 +; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:50 +; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:48 +; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:46 +; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:44 +; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:42 +; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:40 +; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:38 +; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:36 +; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:34 +; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:32 +; GFX9-NEXT: global_load_ushort v26, v[1:2], off +; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:2 ; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16 ; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 ; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 ; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22 -; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:24 +; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24 +; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:30 ; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26 ; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28 ; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4 @@ -8513,122 +8513,122 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 ; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:224 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 ; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX9-NEXT: s_waitcnt vmcnt(30) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v13 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v14 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 ; GFX9-NEXT: s_waitcnt vmcnt(32) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21 ; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:192 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v20 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 ; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v19 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:152 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144 ; GFX9-NEXT: s_waitcnt vmcnt(44) -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:136 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 -; GFX9-NEXT: s_waitcnt vmcnt(43) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 ; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX9-NEXT: s_waitcnt vmcnt(38) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 ; GFX9-NEXT: s_waitcnt vmcnt(41) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 ; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 ; GFX9-NEXT: s_waitcnt vmcnt(41) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 ; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 ; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7 ; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 @@ -8642,25 +8642,25 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v22 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8861,72 +8861,72 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:12 -; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:8 -; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:4 -; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:2 +; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:2 +; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:12 +; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:8 +; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:4 ; GFX11-NEXT: global_load_u16 v7, v[1:2], off ; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6 ; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10 ; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14 -; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:28 -; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:24 -; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:20 -; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:18 +; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:18 +; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:28 +; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:24 +; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:20 ; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16 ; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22 ; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26 ; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30 -; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:44 -; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:40 -; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:36 -; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:34 +; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:34 +; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:44 +; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:40 +; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:36 ; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32 ; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38 ; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42 ; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46 -; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:60 -; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:56 -; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:52 -; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:50 +; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:50 +; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:60 +; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:56 +; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:52 ; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48 ; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54 ; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58 ; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62 ; GFX11-NEXT: s_waitcnt vmcnt(31) -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_waitcnt vmcnt(30) ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4 ; GFX11-NEXT: s_waitcnt vmcnt(29) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(28) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-NEXT: s_waitcnt vmcnt(27) ; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 ; GFX11-NEXT: s_waitcnt vmcnt(26) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; GFX11-NEXT: s_waitcnt vmcnt(25) ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX11-NEXT: s_waitcnt vmcnt(24) ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: v_lshlrev_b32_e32 v102, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v11 ; GFX11-NEXT: s_waitcnt vmcnt(22) -; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v12 ; GFX11-NEXT: s_waitcnt vmcnt(21) ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX11-NEXT: s_waitcnt vmcnt(20) ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v15 ; GFX11-NEXT: s_waitcnt vmcnt(18) -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v16 ; GFX11-NEXT: s_waitcnt vmcnt(17) ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-NEXT: s_waitcnt vmcnt(16) ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX11-NEXT: s_waitcnt vmcnt(15) -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19 ; GFX11-NEXT: s_waitcnt vmcnt(14) ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20 ; GFX11-NEXT: s_waitcnt vmcnt(13) @@ -8934,7 +8934,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(12) ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-NEXT: s_waitcnt vmcnt(11) -; GFX11-NEXT: v_lshlrev_b32_e32 v103, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v23 ; GFX11-NEXT: s_waitcnt vmcnt(10) ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24 ; GFX11-NEXT: s_waitcnt vmcnt(9) @@ -8942,7 +8942,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v27 ; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28 ; GFX11-NEXT: s_waitcnt vmcnt(5) @@ -8957,36 +8957,36 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v68 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v65 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v65 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v29 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v29 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v30 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v30 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v52 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v52 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v49 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v49 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v21 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v21 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v34 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v22 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v103 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v22 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v34 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v35 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v36 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v101 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v102 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v100 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v101 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v13 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v14 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v100 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v13 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v14 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v39 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v39 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v38 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v38 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v6 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v5 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37 ; GFX11-NEXT: s_clause 0xf @@ -44505,15 +44505,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v9, v11 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_add_f32 v2, v6, v8 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v6, v8 ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v7, v6 :: v_dual_lshlrev_b32 v6, 16, v4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 198bf839cb1cb..3eba106b861c6 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -9090,11 +9090,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 @@ -9146,11 +9145,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start @@ -10698,11 +10696,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 @@ -10754,11 +10751,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start @@ -11563,11 +11559,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 @@ -11619,11 +11614,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index bee2813ca30f0..c17225594164f 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -7385,11 +7385,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7445,10 +7444,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -7540,11 +7539,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 @@ -7596,11 +7594,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 1826743ed017d..56719dccbd08a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -7385,11 +7385,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7445,10 +7444,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -7540,11 +7539,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 @@ -7596,11 +7594,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 9503ffbdb4104..99b7c7737f4ae 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -910,38 +910,36 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7 -; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2 -; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:5 +; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:7 ; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4 ; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_ffbh_u32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 -; SI-NEXT: v_min_u32_e32 v0, v1, v0 +; SI-NEXT: v_ffbh_u32_e32 v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; SI-NEXT: v_min_u32_e32 v0, v0, v1 ; SI-NEXT: v_min_u32_e32 v0, 64, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 97bcd8b5ee68a..73fddb53d1dcc 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -874,38 +874,36 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7 -; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2 -; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:5 +; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:7 ; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4 ; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v7 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_ffbl_b32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; SI-NEXT: v_min_u32_e32 v0, v0, v1 +; SI-NEXT: v_ffbl_b32_e32 v1, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 +; SI-NEXT: v_min_u32_e32 v0, v1, v0 ; SI-NEXT: v_min_u32_e32 v0, 64, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index d1090738e24a6..745e047348626 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1568,10 +1568,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX9-NEXT: s_mov_b32 s0, 0x4000405 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[12:13] offset:2 -; GFX9-NEXT: global_load_ubyte v2, v0, s[14:15] offset:3 ; GFX9-NEXT: global_load_ubyte v3, v0, s[12:13] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[14:15] offset:3 ; GFX9-NEXT: global_load_ubyte v4, v0, s[14:15] offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1904,37 +1904,37 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[2:3] -; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v7, v[8:9] -; VI-NEXT: flat_load_ubyte v8, v[2:3] -; VI-NEXT: flat_load_ubyte v2, v[0:1] +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v12, v[2:3] +; VI-NEXT: flat_load_ubyte v2, v[8:9] +; VI-NEXT: flat_load_ubyte v3, v[10:11] ; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[0:1] +; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v9, v[0:1] +; VI-NEXT: flat_load_ubyte v7, v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v12 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll index b1664c59a7e4c..93422e259b827 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -209,27 +209,28 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 -; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 -; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 +; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 +; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1 +; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 +; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:3 ; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v6 offset:5 -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v1 -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v2 offset:1 -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v4 offset:3 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:5 +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:3 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7 +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align1: @@ -492,23 +493,24 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 +; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:6 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:10 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align2: @@ -808,29 +810,27 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 -; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1 +; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:12 +; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:12 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8 -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:6 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:10 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14 +; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align2: diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 06c30dfd36033..d95f528442efd 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -522,36 +522,39 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; CI-NEXT: ds_read_u8 v2, v1 offset:34 -; CI-NEXT: ds_read_u8 v3, v1 offset:32 -; CI-NEXT: ds_read_u8 v4, v1 offset:3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: ds_read_u8 v2, v1 offset:1 +; CI-NEXT: ds_read_u8 v3, v1 offset:34 +; CI-NEXT: ds_read_u8 v4, v1 offset:32 ; CI-NEXT: ds_read_u8 v5, v1 offset:2 -; CI-NEXT: ds_read_u8 v6, v1 offset:1 -; CI-NEXT: ds_read_u8 v7, v1 +; CI-NEXT: ds_read_u8 v6, v1 +; CI-NEXT: ds_read_u8 v7, v1 offset:3 ; CI-NEXT: ds_read_u8 v8, v1 offset:33 ; CI-NEXT: ds_read_u8 v1, v1 offset:35 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(7) +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; CI-NEXT: s_waitcnt lgkmcnt(3) +; CI-NEXT: v_or_b32_e32 v2, v2, v6 +; CI-NEXT: s_waitcnt lgkmcnt(2) +; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 +; CI-NEXT: v_or_b32_e32 v5, v6, v5 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; CI-NEXT: v_or_b32_e32 v4, v4, v5 ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_or_b32_e32 v2, v5, v2 ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; CI-NEXT: v_or_b32_e32 v1, v1, v2 -; CI-NEXT: v_or_b32_e32 v6, v6, v7 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v3, v5, v3 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 -; CI-NEXT: v_add_f32_e32 v2, v4, v1 +; CI-NEXT: v_or_b32_e32 v4, v5, v4 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v2, v2, v1 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -612,36 +615,39 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; CI-NEXT: ds_read_u8 v2, v1 offset:11 -; CI-NEXT: ds_read_u8 v3, v1 offset:9 -; CI-NEXT: ds_read_u8 v4, v1 offset:8 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 +; CI-NEXT: ds_read_u8 v2, v1 offset:6 +; CI-NEXT: ds_read_u8 v3, v1 offset:11 +; CI-NEXT: ds_read_u8 v4, v1 offset:9 ; CI-NEXT: ds_read_u8 v5, v1 offset:7 -; CI-NEXT: ds_read_u8 v6, v1 offset:6 -; CI-NEXT: ds_read_u8 v7, v1 offset:5 +; CI-NEXT: ds_read_u8 v6, v1 offset:5 +; CI-NEXT: ds_read_u8 v7, v1 offset:8 ; CI-NEXT: ds_read_u8 v8, v1 offset:10 ; CI-NEXT: ds_read_u8 v1, v1 offset:12 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(7) +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; CI-NEXT: s_waitcnt lgkmcnt(3) +; CI-NEXT: v_or_b32_e32 v2, v2, v6 +; CI-NEXT: s_waitcnt lgkmcnt(2) +; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 +; CI-NEXT: v_or_b32_e32 v5, v6, v5 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; CI-NEXT: v_or_b32_e32 v4, v4, v5 ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: v_or_b32_e32 v2, v5, v2 ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; CI-NEXT: v_or_b32_e32 v1, v1, v2 -; CI-NEXT: v_or_b32_e32 v6, v6, v7 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v3, v5, v3 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 -; CI-NEXT: v_add_f32_e32 v2, v4, v1 +; CI-NEXT: v_or_b32_e32 v4, v5, v4 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v1, v1, v4 +; CI-NEXT: v_add_f32_e32 v2, v2, v1 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -709,17 +715,17 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; CI-NEXT: ds_read_u16 v2, v1 offset:32 -; CI-NEXT: ds_read_u16 v3, v1 offset:2 +; CI-NEXT: ds_read_u16 v2, v1 offset:2 +; CI-NEXT: ds_read_u16 v3, v1 offset:32 ; CI-NEXT: ds_read_u16 v4, v1 ; CI-NEXT: ds_read_u16 v1, v1 offset:34 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v3, v3, v4 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v2, v2, v4 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v1, v1, v2 -; CI-NEXT: v_add_f32_e32 v2, v3, v1 +; CI-NEXT: v_or_b32_e32 v1, v1, v3 +; CI-NEXT: v_add_f32_e32 v2, v2, v1 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -1460,17 +1466,17 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_or_b32_e32 v1, v1, v4 -; CI-NEXT: ds_read_u8 v4, v0 offset:67 -; CI-NEXT: ds_read_u8 v6, v0 offset:66 +; CI-NEXT: ds_read_u8 v4, v0 offset:66 +; CI-NEXT: ds_read_u8 v6, v0 offset:67 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; CI-NEXT: v_or_b32_e32 v0, v2, v0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; CI-NEXT: v_or_b32_e32 v2, v2, v4 +; CI-NEXT: v_or_b32_e32 v2, v2, v6 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 @@ -1481,26 +1487,25 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-ALIGNED: ; %bb.0: ; %entry ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:70 -; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:65 -; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:66 -; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:67 -; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:68 -; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69 +; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65 +; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66 +; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67 +; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68 +; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:70 +; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:69 ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:72 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v6 -; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v7 +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 60334e46a4454..52bcaed7ec75a 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -153,14 +153,14 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 { ; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v6, vcc, 3, v0 ; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-ALIGNED-NEXT: flat_load_ubyte v6, v[6:7] ; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[4:5] +; GFX7-ALIGNED-NEXT: flat_load_ubyte v5, v[6:7] ; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3] ; GFX7-ALIGNED-NEXT: flat_load_ubyte v0, v[0:1] ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v5 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index f9694dcd89abf..6f8da57e223e5 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -204,14 +204,14 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 { ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index fab45c9dc3bc3..61f5b73033f5e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -569,14 +569,13 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s12 ; GFX11-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v1, s13 -; GFX11-NEXT: v_mov_b32_e32 v3, s12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s10, v0 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s8, v3 ; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 498df8a65feda..200f74beec385 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -536,8 +536,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[4:5], 0x14 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x14 ; SI-NEXT: s_load_dword s1, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index 2040aedc250e6..ac438062ae208 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -10546,14 +10546,14 @@ define void @freeze_v16p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) { ; GFX6-SDAG-LABEL: freeze_v16p3: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 8, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GFX6-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v16, vcc, 56, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 48, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 48, v0 +; GFX6-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX6-SDAG-NEXT: ds_read_b64 v[2:3], v2 ; GFX6-SDAG-NEXT: ds_read_b64 v[4:5], v4 ; GFX6-SDAG-NEXT: ds_read_b64 v[6:7], v6 @@ -10563,22 +10563,23 @@ define void @freeze_v16p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) { ; GFX6-SDAG-NEXT: ds_read_b64 v[14:15], v14 ; GFX6-SDAG-NEXT: ds_read_b64 v[16:17], v16 ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 48, v1 -; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; GFX6-SDAG-NEXT: ds_write_b64 v0, v[10:11] +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(4) +; GFX6-SDAG-NEXT: ds_write_b64 v1, v[8:9] +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2) +; GFX6-SDAG-NEXT: ds_write_b64 v0, v[14:15] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 56, v1 -; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(2) ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[16:17] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 32, v1 -; GFX6-SDAG-NEXT: ds_write_b64 v0, v[14:15] -; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 40, v1 ; GFX6-SDAG-NEXT: ds_write_b64 v0, v[12:13] +; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 40, v1 +; GFX6-SDAG-NEXT: ds_write_b64 v0, v[10:11] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 16, v1 -; GFX6-SDAG-NEXT: ds_write_b64 v0, v[4:5] +; GFX6-SDAG-NEXT: ds_write_b64 v0, v[6:7] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 24, v1 -; GFX6-SDAG-NEXT: ds_write_b64 v0, v[2:3] +; GFX6-SDAG-NEXT: ds_write_b64 v0, v[4:5] ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 8, v1 -; GFX6-SDAG-NEXT: ds_write_b64 v1, v[8:9] -; GFX6-SDAG-NEXT: ds_write_b64 v0, v[6:7] +; GFX6-SDAG-NEXT: ds_write_b64 v0, v[2:3] ; GFX6-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -11565,22 +11566,22 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-NEXT: v_add_i32_e32 v9, vcc, 44, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v10, vcc, 40, v0 ; GFX6-SDAG-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 28, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; GFX6-SDAG-NEXT: v_add_i32_e32 v15, vcc, 20, v0 ; GFX6-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen -; GFX6-SDAG-NEXT: v_add_i32_e32 v16, vcc, 32, v0 +; GFX6-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; GFX6-SDAG-NEXT: buffer_load_dword v16, v16, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX6-SDAG-NEXT: v_add_i32_e32 v17, vcc, 4, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1 @@ -11603,13 +11604,15 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-SDAG-NEXT: v_add_i32_e32 v19, vcc, 36, v1 ; GFX6-SDAG-NEXT: v_add_i32_e32 v8, vcc, 44, v1 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX6-SDAG-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(7) -; GFX6-SDAG-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_store_dword v13, v7, s[0:3], 0 offen -; GFX6-SDAG-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX6-SDAG-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; GFX6-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX6-SDAG-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX6-SDAG-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX6-SDAG-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen ; GFX6-SDAG-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen @@ -11631,24 +11634,24 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX6-GISEL-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX6-GISEL-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; GFX6-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GFX6-GISEL-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; GFX6-GISEL-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; GFX6-GISEL-NEXT: v_add_i32_e32 v8, vcc, 28, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v10, vcc, 32, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v11, vcc, 36, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v12, vcc, 40, v0 ; GFX6-GISEL-NEXT: v_add_i32_e32 v13, vcc, 44, v0 ; GFX6-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen @@ -11669,30 +11672,32 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 16, v1 ; GFX6-GISEL-NEXT: v_add_i32_e32 v17, vcc, 20, v1 -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(12) -; GFX6-GISEL-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 40, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) -; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 24, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v3, vcc, 24, v1 ; GFX6-GISEL-NEXT: v_add_i32_e32 v18, vcc, 28, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX6-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) -; GFX6-GISEL-NEXT: v_add_i32_e32 v5, vcc, 32, v1 +; GFX6-GISEL-NEXT: v_add_i32_e32 v4, vcc, 32, v1 +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX6-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 40, v1 ; GFX6-GISEL-NEXT: v_add_i32_e32 v19, vcc, 36, v1 -; GFX6-GISEL-NEXT: v_add_i32_e32 v6, vcc, 44, v1 -; GFX6-GISEL-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX6-GISEL-NEXT: s_waitcnt expcnt(0) +; GFX6-GISEL-NEXT: v_add_i32_e32 v5, vcc, 44, v1 +; GFX6-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v7, v17, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v9, v18, s[0:3], 0 offen -; GFX6-GISEL-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX6-GISEL-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen ; GFX6-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX6-GISEL-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen +; GFX6-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 48, v1 ; GFX6-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX6-GISEL-NEXT: v_add_i32_e32 v2, vcc, 52, v1 @@ -11723,22 +11728,22 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, 44, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, 40, v0 ; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v12, vcc, 28, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v15, vcc, 20, v0 ; GFX7-SDAG-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v15, v15, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v14, v14, s[0:3], 0 offen -; GFX7-SDAG-NEXT: v_add_i32_e32 v16, vcc, 32, v0 +; GFX7-SDAG-NEXT: buffer_load_dword v13, v13, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; GFX7-SDAG-NEXT: buffer_load_dword v16, v16, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, 4, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v18, vcc, 8, v1 @@ -11759,13 +11764,15 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-SDAG-NEXT: v_add_i32_e32 v19, vcc, 36, v1 ; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, 44, v1 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX7-SDAG-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(7) -; GFX7-SDAG-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_store_dword v13, v7, s[0:3], 0 offen -; GFX7-SDAG-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX7-SDAG-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) -; GFX7-SDAG-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; GFX7-SDAG-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX7-SDAG-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(9) +; GFX7-SDAG-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen ; GFX7-SDAG-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen @@ -11785,24 +11792,24 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-GISEL-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; GFX7-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; GFX7-GISEL-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 28, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 32, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 36, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 40, v0 ; GFX7-GISEL-NEXT: v_add_i32_e32 v13, vcc, 44, v0 ; GFX7-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen @@ -11822,28 +11829,29 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX7-GISEL-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 16, v1 ; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 20, v1 -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(12) -; GFX7-GISEL-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 40, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 24, v1 +; GFX7-GISEL-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, 24, v1 ; GFX7-GISEL-NEXT: v_add_i32_e32 v18, vcc, 28, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen -; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, 32, v1 +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX7-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v4, vcc, 32, v1 +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX7-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 40, v1 ; GFX7-GISEL-NEXT: v_add_i32_e32 v19, vcc, 36, v1 -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 44, v1 -; GFX7-GISEL-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX7-GISEL-NEXT: v_add_i32_e32 v5, vcc, 44, v1 +; GFX7-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v7, v17, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v9, v18, s[0:3], 0 offen -; GFX7-GISEL-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX7-GISEL-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen ; GFX7-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX7-GISEL-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen +; GFX7-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 48, v1 ; GFX7-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 52, v1 @@ -11861,24 +11869,24 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-GISEL-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, 8, v0 ; GFX8-GISEL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 8, v0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v5, vcc, 12, v0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 16, v0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 20, v0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 24, v0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, 28, v0 +; GFX8-GISEL-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 12, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v5, vcc, 16, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 20, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 24, v0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 28, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 36, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 40, v0 ; GFX8-GISEL-NEXT: v_add_u32_e32 v13, vcc, 44, v0 ; GFX8-GISEL-NEXT: buffer_load_dword v4, v4, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v5, v5, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v6, v6, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v7, v7, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v8, v8, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_load_dword v9, v9, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v10, v10, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v11, v11, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_load_dword v12, v12, s[0:3], 0 offen @@ -11898,28 +11906,29 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) { ; GFX8-GISEL-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 16, v1 ; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 20, v1 -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(12) -; GFX8-GISEL-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 40, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 24, v1 +; GFX8-GISEL-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, 24, v1 ; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 28, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen -; GFX8-GISEL-NEXT: v_add_u32_e32 v5, vcc, 32, v1 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX8-GISEL-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 32, v1 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) +; GFX8-GISEL-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 40, v1 ; GFX8-GISEL-NEXT: v_add_u32_e32 v19, vcc, 36, v1 -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 44, v1 -; GFX8-GISEL-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX8-GISEL-NEXT: v_add_u32_e32 v5, vcc, 44, v1 +; GFX8-GISEL-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v7, v17, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v6, v17, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v9, v18, s[0:3], 0 offen -; GFX8-GISEL-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) ; GFX8-GISEL-NEXT: buffer_store_dword v11, v19, s[0:3], 0 offen ; GFX8-GISEL-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(14) -; GFX8-GISEL-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen +; GFX8-GISEL-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 48, v1 ; GFX8-GISEL-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 52, v1 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 81b8b36180746..a901d7f97eb37 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -3380,42 +3380,117 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i } define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { -; CIGFX89-LABEL: void_func_v32i32_v2i64_v2f64: -; CIGFX89: ; %bb.0: -; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; CIGFX89-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; CIGFX89-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; CIGFX89-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 -; CIGFX89-NEXT: s_mov_b32 s6, -1 -; CIGFX89-NEXT: s_waitcnt vmcnt(8) -; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 -; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: void_func_v32i32_v2i64_v2f64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[35:38], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[31:34], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v2i64_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v2i64_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[35:38], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[31:34], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i64_v2f64: ; GFX11: ; %bb.0: @@ -3552,13 +3627,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3570,29 +3645,29 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: void_func_v32i32_v8i32_v8f32: @@ -3601,13 +3676,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3619,29 +3694,29 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: void_func_v32i32_v8i32_v8f32: @@ -3650,13 +3725,13 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3668,15 +3743,15 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3684,14 +3759,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v8i32_v8f32: @@ -3791,40 +3866,40 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: void_func_v32i32_v16i32_v16f32: @@ -3864,40 +3939,40 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: void_func_v32i32_v16i32_v16f32: @@ -3938,27 +4013,27 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3966,14 +4041,14 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v16i32_v16f32: @@ -4259,9 +4334,9 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 ; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 ; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 ; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 @@ -4275,16 +4350,16 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4292,15 +4367,15 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v32, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v34, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4308,14 +4383,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 @@ -4324,6 +4391,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: void_func_v32i32_v16i8: @@ -4332,9 +4407,9 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 @@ -4348,16 +4423,16 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4365,15 +4440,15 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v33, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v32, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v34, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4381,14 +4456,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 @@ -4397,6 +4464,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: void_func_v32i32_v16i8: @@ -4405,9 +4480,9 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 @@ -4421,18 +4496,17 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4440,15 +4514,15 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v33, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v32, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v34, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4456,14 +4530,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 @@ -4472,6 +4538,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: void_func_v32i32_v16i8: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 75619532a2e37..668219875db72 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2450,22 +2450,21 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 -; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 -; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 +; GFX10-NEXT: s_clause 0x8 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:284 ; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:280 ; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:276 @@ -2499,16 +2498,14 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:148 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:136 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 3e15b135eeab9..b7b69ed9f53ba 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -4276,14 +4276,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s2, s0 ; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4523,14 +4523,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX9-NEXT: s_add_u32 s0, s2, s0 ; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index d28f0a190e117..9e7968f1acb84 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -2903,13 +2903,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[12:13] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[10:11] ; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 ; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2925,12 +2925,12 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[12:13] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[10:11] ; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c +; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0c01 ; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0xc020101 @@ -2950,12 +2950,12 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c +; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v2, v0, v0, 0xc0c0c01 ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020101 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index df77757443391..f995f426c6372 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -4454,13 +4454,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[12:13] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[10:11] ; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 ; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 @@ -4476,12 +4476,12 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[12:13] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[10:11] ; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c +; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0c01 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 @@ -4500,12 +4500,12 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c +; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v2, v0, v0, 0xc0c0c01 ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020101 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index f6330f4eb8216..be16fac4c53f7 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1101,14 +1101,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { ; SI-LABEL: dynamic_insertelement_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; SI-NEXT: s_load_dword s10, s[8:9], 0x8 ; SI-NEXT: s_load_dword s11, s[8:9], 0x11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s10, 3 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cselect_b32 s3, s11, s3 ; SI-NEXT: s_cmp_eq_u32 s10, 2 ; SI-NEXT: s_cselect_b32 s2, s11, s2 @@ -1125,14 +1125,14 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_load_dword s10, s[8:9], 0x20 ; VI-NEXT: s_load_dword s11, s[8:9], 0x44 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s10, 3 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_cselect_b32 s3, s11, s3 ; VI-NEXT: s_cmp_eq_u32 s10, 2 ; VI-NEXT: s_cselect_b32 s2, s11, s2 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 9df995b5a7066..a18b5b5396f63 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -1664,8 +1664,8 @@ entry: define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind { ; SI-LABEL: v5i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[4:5], 0xf ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xf ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -5191,16 +5191,16 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: s_add_u32 s2, s0, 2 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s4, 42 -; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: s_add_u32 s0, s4, 42 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_ushort v4, v[4:5] +; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5208,10 +5208,10 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_byte v[0:1], v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_short v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll index f7c37caf41eab..393d8c1a1bf2f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -69,14 +69,13 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 ; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm @@ -108,14 +107,13 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 ; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm @@ -147,14 +145,13 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 ; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index c0afc0a443955..49a334b8b6c52 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -984,9 +984,9 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 +; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 @@ -1033,9 +1033,9 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v0 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v1 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v1 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27 ; CHECK-SDAG-NEXT: ;;#ASMSTART @@ -1429,9 +1429,9 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 +; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 ; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 @@ -1478,9 +1478,9 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v0 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v1 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v1 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27 ; CHECK-SDAG-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index a18e5ace18704..f971080e02c5b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -2522,9 +2522,9 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-LABEL: v_maximum_v16f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x19 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX10-NEXT: s_clause 0x18 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 @@ -2548,22 +2548,21 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:76 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:104 -; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: s_waitcnt vmcnt(23) ; GFX10-NEXT: v_max_f64 v[82:83], v[0:1], v[31:32] ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[31:32] -; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: s_waitcnt vmcnt(21) ; GFX10-NEXT: v_max_f64 v[84:85], v[2:3], v[33:34] ; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[33:34] -; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: s_waitcnt vmcnt(19) ; GFX10-NEXT: v_max_f64 v[32:33], v[4:5], v[35:36] ; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[35:36] -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 @@ -2593,9 +2592,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: v_max_f64 v[50:51], v[18:19], v[80:81] ; GFX10-NEXT: v_max_f64 v[70:71], v[22:23], v[68:69] ; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[68:69] -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_max_f64 v[68:69], v[24:25], v[66:67] -; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] ; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, 0, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v35, 0x7ff80000, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, 0, s7 @@ -2614,27 +2610,30 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: v_cndmask_b32_e64 v21, v65, 0x7ff80000, s13 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v70, 0, s14 ; GFX10-NEXT: v_cndmask_b32_e64 v23, v71, 0x7ff80000, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v68, 0, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v69, 0x7ff80000, s15 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_max_f64 v[68:69], v[24:25], v[66:67] +; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_max_f64 v[80:81], v[28:29], v[0:1] -; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[0:1] +; GFX10-NEXT: v_max_f64 v[66:67], v[26:27], v[0:1] +; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_max_f64 v[66:67], v[26:27], v[2:3] -; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo +; GFX10-NEXT: v_max_f64 v[80:81], v[28:29], v[2:3] +; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f64 v[86:87], v[30:31], v[4:5] ; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v83, 0x7ff80000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v84, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v85, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v32, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v33, 0x7ff80000, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v80, 0, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v81, 0x7ff80000, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v68, 0, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v69, 0x7ff80000, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v26, v66, 0, s16 ; GFX10-NEXT: v_cndmask_b32_e64 v27, v67, 0x7ff80000, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v80, 0, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v81, 0x7ff80000, s17 ; GFX10-NEXT: v_cndmask_b32_e64 v30, v86, 0, s18 ; GFX10-NEXT: v_cndmask_b32_e64 v31, v87, 0x7ff80000, s18 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 4c413af878462..dfd67873c3b86 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -2522,9 +2522,9 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-LABEL: v_minimum_v16f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x19 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX10-NEXT: s_clause 0x18 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 @@ -2548,22 +2548,21 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:76 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:104 -; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: s_waitcnt vmcnt(23) ; GFX10-NEXT: v_min_f64 v[82:83], v[0:1], v[31:32] ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[31:32] -; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: s_waitcnt vmcnt(21) ; GFX10-NEXT: v_min_f64 v[84:85], v[2:3], v[33:34] ; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[33:34] -; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: s_waitcnt vmcnt(19) ; GFX10-NEXT: v_min_f64 v[32:33], v[4:5], v[35:36] ; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[35:36] -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 @@ -2593,9 +2592,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: v_min_f64 v[50:51], v[18:19], v[80:81] ; GFX10-NEXT: v_min_f64 v[70:71], v[22:23], v[68:69] ; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[68:69] -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_min_f64 v[68:69], v[24:25], v[66:67] -; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] ; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, 0, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v35, 0x7ff80000, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, 0, s7 @@ -2614,27 +2610,30 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX10-NEXT: v_cndmask_b32_e64 v21, v65, 0x7ff80000, s13 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v70, 0, s14 ; GFX10-NEXT: v_cndmask_b32_e64 v23, v71, 0x7ff80000, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v68, 0, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v69, 0x7ff80000, s15 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_min_f64 v[68:69], v[24:25], v[66:67] +; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_min_f64 v[80:81], v[28:29], v[0:1] -; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[0:1] +; GFX10-NEXT: v_min_f64 v[66:67], v[26:27], v[0:1] +; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_min_f64 v[66:67], v[26:27], v[2:3] -; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo +; GFX10-NEXT: v_min_f64 v[80:81], v[28:29], v[2:3] +; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_f64 v[86:87], v[30:31], v[4:5] ; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v82, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v83, 0x7ff80000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v84, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v85, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v32, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v33, 0x7ff80000, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v80, 0, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v81, 0x7ff80000, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v68, 0, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v69, 0x7ff80000, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v26, v66, 0, s16 ; GFX10-NEXT: v_cndmask_b32_e64 v27, v67, 0x7ff80000, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v80, 0, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v81, 0x7ff80000, s17 ; GFX10-NEXT: v_cndmask_b32_e64 v30, v86, 0, s18 ; GFX10-NEXT: v_cndmask_b32_e64 v31, v87, 0x7ff80000, s18 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index cce5022213291..8e312a0e195ff 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -760,34 +760,34 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_clause 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:28 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:24 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:20 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1] offset:16 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:12 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:8 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:4 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1] +; GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:12 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:28 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:24 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:20 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1] offset:16 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:8 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:4 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1] ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x4 -; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX12-TRUE16-NEXT: s_endpgm ; ; GFX12-FAKE16-LABEL: constant_load_v16i16_align2: @@ -796,34 +796,34 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_clause 0x7 -; GFX12-FAKE16-NEXT: global_load_u16 v3, v8, s[0:1] offset:28 -; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:24 -; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:20 -; GFX12-FAKE16-NEXT: global_load_u16 v0, v8, s[0:1] offset:16 -; GFX12-FAKE16-NEXT: global_load_u16 v7, v8, s[0:1] offset:12 -; GFX12-FAKE16-NEXT: global_load_u16 v6, v8, s[0:1] offset:8 -; GFX12-FAKE16-NEXT: global_load_u16 v5, v8, s[0:1] offset:4 -; GFX12-FAKE16-NEXT: global_load_u16 v4, v8, s[0:1] +; GFX12-FAKE16-NEXT: global_load_u16 v3, v8, s[0:1] offset:12 +; GFX12-FAKE16-NEXT: global_load_u16 v7, v8, s[0:1] offset:28 +; GFX12-FAKE16-NEXT: global_load_u16 v6, v8, s[0:1] offset:24 +; GFX12-FAKE16-NEXT: global_load_u16 v5, v8, s[0:1] offset:20 +; GFX12-FAKE16-NEXT: global_load_u16 v4, v8, s[0:1] offset:16 +; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:8 +; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:4 +; GFX12-FAKE16-NEXT: global_load_u16 v0, v8, s[0:1] ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 -; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x4 -; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX12-FAKE16-NEXT: s_endpgm entry: %ld = load <16 x i16>, ptr addrspace(4) %ptr0, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll index a6ce512164b89..8a3cc57e08579 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -7,13 +7,13 @@ define amdgpu_vs void @test(ptr addrspace(8) inreg %arg1, ptr addrspace(3) %arg2) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v2, v1 +; CHECK-NEXT: ds_read_b32 v3, v1 +; CHECK-NEXT: ds_read_b32 v2, v2 ; CHECK-NEXT: ds_read_b32 v1, v4 -; CHECK-NEXT: ds_read_b32 v3, v3 ; CHECK-NEXT: ds_read_b32 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: exp mrt0 off, off, off, off @@ -69,36 +69,36 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8) ; CHECK-NEXT: v_add_i32_e32 v0, vcc, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 8, v1 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 20, v1 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1 -; CHECK-NEXT: v_mov_b32_e32 v9, s0 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v1 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v1 +; CHECK-NEXT: v_mov_b32_e32 v10, s0 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2 ; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v6, v0 ; CHECK-NEXT: ds_read_b32 v5, v3 ; CHECK-NEXT: ds_read_b32 v4, v4 -; CHECK-NEXT: ds_read_b32 v8, v6 -; CHECK-NEXT: ds_read_b32 v7, v7 -; CHECK-NEXT: ds_read_b32 v6, v0 +; CHECK-NEXT: ds_read_b32 v8, v7 +; CHECK-NEXT: ds_read_b32 v7, v9 ; CHECK-NEXT: ds_read_b32 v3, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 20, v2 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, 16, v2 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc -; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc +; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc +; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc ; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: ds_read_b32 v4, v11 +; CHECK-NEXT: ds_read_b32 v5, v11 +; CHECK-NEXT: ds_read_b32 v4, v12 ; CHECK-NEXT: ds_read_b32 v3, v0 ; CHECK-NEXT: ds_read_b32 v1, v1 -; CHECK-NEXT: ds_read_b32 v0, v12 -; CHECK-NEXT: ds_read_b32 v5, v10 +; CHECK-NEXT: ds_read_b32 v0, v9 ; CHECK-NEXT: ds_read_b32 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: exp mrt0 off, off, off, off ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc -; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc +; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc +; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc ; CHECK-NEXT: s_endpgm %load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4 %vec11 = shufflevector <6 x float> %load1, <6 x float> poison, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll index 10dca76cc389a..d634e40f1d79b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -95,51 +95,51 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u8 v1, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v1, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 ; GFX7-NEXT: ds_read_u8 v5, v0 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:3 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:15 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:14 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:13 -; GFX7-NEXT: ds_read_u8 v8, v0 offset:12 -; GFX7-NEXT: ds_read_u8 v9, v0 offset:11 -; GFX7-NEXT: ds_read_u8 v10, v0 offset:10 -; GFX7-NEXT: ds_read_u8 v11, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:15 +; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 +; GFX7-NEXT: ds_read_u8 v10, v0 offset:12 +; GFX7-NEXT: ds_read_u8 v11, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:8 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v9 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 @@ -331,21 +331,21 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v1, v0 offset:2 ; GFX7-NEXT: ds_read_u16 v3, v0 offset:12 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:8 -; GFX7-NEXT: ds_read_u16 v1, v0 offset:4 -; GFX7-NEXT: ds_read_u16 v4, v0 offset:2 +; GFX7-NEXT: ds_read_u16 v4, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v5, v0 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:6 ; GFX7-NEXT: ds_read_u16 v7, v0 offset:10 ; GFX7-NEXT: ds_read_u16 v8, v0 offset:14 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll index 2da3fce72072e..b917b48b90e6a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -86,41 +86,41 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_u8 v1, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v2, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v1, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:4 ; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:1 ; GFX7-NEXT: ds_read_u8 v5, v0 ; GFX7-NEXT: ds_read_u8 v6, v0 offset:3 ; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 ; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(7) +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:11 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:10 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:9 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:8 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v3 @@ -274,19 +274,19 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v1, v0 offset:2 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:8 -; GFX7-NEXT: ds_read_u16 v1, v0 offset:4 -; GFX7-NEXT: ds_read_u16 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v4, v0 ; GFX7-NEXT: ds_read_u16 v5, v0 offset:6 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:10 -; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(5) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index 1857eaba0a2a9..1e246465ab1e3 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -139,17 +139,17 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: global_load_dword v4, v0, s[2:3] +; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v1, v0, s[6:7] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 8d020b9e1a603..0003366f3a3ea 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -173,54 +173,53 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: v_mov_b32_e32 v26, s0 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 ; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 ; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 ; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 ; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 ; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 -; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: s_waitcnt vmcnt(18) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 -; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -464,54 +463,53 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: v_mov_b32_e32 v26, s0 ; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 ; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 ; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 ; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 ; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 ; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 ; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 -; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: s_waitcnt vmcnt(18) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 -; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:8 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll index cadc3dadb0a1e..b43ccc551ca95 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll @@ -451,12 +451,12 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 ; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 @@ -506,12 +506,12 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 ; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 @@ -896,19 +896,18 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -924,18 +923,18 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -966,19 +965,18 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -994,18 +992,18 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index ee52611fae5ca..9cc42ac448067 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -3583,104 +3583,102 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: .LBB4_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 ; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 ; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 ; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:240 ; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 ; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 ; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 ; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:192 ; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 ; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 ; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 ; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(41) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(37) +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(32) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:176 -; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(16) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:144 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:80 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %memcpy-split @@ -3748,16 +3746,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 @@ -3798,10 +3797,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) @@ -3811,46 +3809,46 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(45) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(44) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: s_waitcnt vmcnt(30) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(29) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -3858,27 +3856,27 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(17) +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 @@ -3888,13 +3886,13 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -3903,13 +3901,13 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill @@ -3955,9 +3953,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) @@ -4240,7 +4236,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v125 @@ -4252,7 +4248,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v123, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -4286,7 +4282,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 ; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:164 ; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:166 ; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 8, v89 @@ -4294,7 +4290,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v77 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v79 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -4303,20 +4299,20 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172 ; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173 ; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:175 ; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:168 ; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:169 ; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v56, 8, v59 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v56, 8, v58 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v57, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -4326,7 +4322,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:177 ; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:178 ; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:180 ; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:181 ; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:182 ; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:183 @@ -4338,7 +4334,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v42 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 @@ -4373,15 +4369,16 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:198 ; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:199 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v98, 8, v100 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v98, 8, v100 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v86, 8, v87 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v99 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v96 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v96 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v97, 8, v99 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:204 @@ -4492,23 +4489,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:248 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:249 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v124, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v123, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen ; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(28) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v27 ; ALIGNED-NEXT: s_waitcnt vmcnt(26) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v24, 8, v26 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v44, v12, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v12, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v58, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v104, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v8, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v22 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v20 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -4517,35 +4514,35 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v13 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v101, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v84, v44, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v44, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v58, 16, v44 -; ALIGNED-NEXT: v_lshl_or_b32 v44, v5, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v58, 16, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v84, v45, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v9, 8, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v60, 16, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v5, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v7, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v60, 16, v45 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v44, v44, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v58, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44 -; ALIGNED-NEXT: v_lshl_or_b32 v44, v90, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v95, 8, v92 +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v45, v45, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v60, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v90, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v104, 8, v92 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:12 @@ -4554,34 +4551,34 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v44, v111, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v111, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v58, v110, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v110, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v44, v92, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v92, 8, v104 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v58, v94, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v58, 16, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v60, v94, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v44 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v60 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v58, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v45, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4606,9 +4603,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 ; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:244 ; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] @@ -4713,7 +4710,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:183 ; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:181 ; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:180 +; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:176 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4730,17 +4727,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:175 +; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:175 ; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:173 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:174 ; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:168 +; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:168 ; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:162 ; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:163 ; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:161 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload @@ -4768,7 +4765,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:151 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:151 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 @@ -5235,11 +5232,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:18 +; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 @@ -5272,7 +5269,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:15 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:14 ; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:12 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:8 +; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:8 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 @@ -12461,97 +12458,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: .LBB9_1: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 ; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 ; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 ; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 ; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:172 ; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188 ; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184 ; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180 ; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 ; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 ; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 ; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 ; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:224 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:160 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:32 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] ; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; CHECK-NEXT: s_cbranch_scc1 .LBB9_1 ; CHECK-NEXT: .LBB9_2: ; %Flow10 @@ -12565,103 +12562,101 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: .LBB9_4: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 ; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 ; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 ; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:240 ; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 ; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 ; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 ; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:208 ; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 ; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 ; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 ; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:176 ; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 ; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 ; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 ; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 ; CHECK-NEXT: s_addc_u32 s5, s5, -1 -; CHECK-NEXT: s_waitcnt vmcnt(41) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(37) +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(32) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(24) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:176 -; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(16) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:144 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:80 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] ; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; CHECK-NEXT: s_cbranch_scc0 .LBB9_4 ; CHECK-NEXT: .LBB9_5: ; %Flow11 @@ -12736,16 +12731,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 @@ -12768,16 +12764,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 @@ -12786,10 +12782,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) @@ -12799,46 +12794,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(45) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(44) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: s_waitcnt vmcnt(30) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(29) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -12846,26 +12841,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v39, 8, v38 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 @@ -12875,13 +12871,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -12890,13 +12886,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -12922,11 +12918,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill @@ -12942,9 +12938,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill @@ -13227,39 +13222,39 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v110, 8, v122 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:158 ; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v104, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v104, 8, v106 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:152 ; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:153 ; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v92, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v92, 8, v95 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v95, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 @@ -13268,8 +13263,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162 ; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 ; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:166 ; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 @@ -13277,9 +13272,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v75 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 @@ -13356,15 +13351,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:198 ; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:199 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v100, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v102 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v97 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v96, 8, v97 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v98 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v99, 8, v101 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:204 @@ -13380,10 +13376,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:201 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:202 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v70, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v81, 8, v69 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 @@ -13475,11 +13471,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(27) @@ -13490,7 +13486,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v13, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) ; ALIGNED-NEXT: v_lshl_or_b32 v91, v9, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v94, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v78, v4, 16, v3 @@ -13502,14 +13498,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v86, v77, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v11, 8, v12 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v71, v91, 16, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v91, 16, v77 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v6, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v7, 8, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v91, 16, v77 @@ -13527,7 +13523,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v0, 8, v91 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v123, 8, v106 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v123, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v3, 8, v125 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill @@ -13560,21 +13556,21 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v77 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v91 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v91, 8, v106 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v77, 8, v107 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -13596,8 +13592,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 ; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 ; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 @@ -13663,7 +13659,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:201 +; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:201 ; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:207 ; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:205 ; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:206 @@ -13729,8 +13725,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:163 ; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:161 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:165 +; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload @@ -13746,20 +13742,20 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:155 ; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:153 ; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:159 ; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:157 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:158 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:158 ; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:152 +; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:152 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:146 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:147 ; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:145 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:151 +; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:151 ; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:149 ; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload @@ -14219,11 +14215,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 @@ -14236,7 +14232,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:16 +; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:16 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 @@ -14305,16 +14301,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 @@ -14355,10 +14352,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill @@ -14369,46 +14365,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(54) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(46) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(44) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v2, v9, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: s_waitcnt vmcnt(38) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) +; ALIGNED-NEXT: s_waitcnt vmcnt(36) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: s_waitcnt vmcnt(34) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) +; ALIGNED-NEXT: s_waitcnt vmcnt(32) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7 @@ -14416,27 +14412,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(29) +; ALIGNED-NEXT: s_waitcnt vmcnt(28) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) +; ALIGNED-NEXT: s_waitcnt vmcnt(26) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) +; ALIGNED-NEXT: s_waitcnt vmcnt(24) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: s_waitcnt vmcnt(22) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(18) +; ALIGNED-NEXT: s_waitcnt vmcnt(17) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) +; ALIGNED-NEXT: s_waitcnt vmcnt(15) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: s_waitcnt vmcnt(11) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -14446,13 +14442,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: s_waitcnt vmcnt(11) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(9) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -14461,13 +14457,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -14513,9 +14509,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(9) ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) @@ -14840,21 +14834,21 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:160 ; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v73, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:162 ; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:163 ; ALIGNED-NEXT: buffer_load_ubyte v88, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:166 ; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v75 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 @@ -15064,7 +15058,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v104, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v21, 8, v22 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v20 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 ; ALIGNED-NEXT: v_lshl_or_b32 v101, v3, 16, v2 @@ -15129,9 +15123,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:484 @@ -15140,10 +15134,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v43 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v57 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v126, v57, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v43, 8, v78 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -15164,7 +15158,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 ; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 ; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:504 ; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 @@ -15294,12 +15288,12 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:174 ; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:172 ; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v73 offset:162 +; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:162 ; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:163 ; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:161 ; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:166 +; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:165 +; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:166 ; ALIGNED-NEXT: flat_store_byte v[2:3], v88 offset:164 ; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload @@ -15798,11 +15792,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:18 +; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:17 +; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll index 4e5688adcd6bb..f08ea27040fb5 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll @@ -485,12 +485,12 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 ; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 @@ -540,12 +540,12 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 ; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 ; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 ; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) ; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 @@ -939,19 +939,18 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -967,18 +966,18 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1009,19 +1008,18 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -1037,18 +1035,18 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1079,20 +1077,20 @@ define void @memmove_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -1149,20 +1147,20 @@ define void @memmove_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -2079,20 +2077,18 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 +; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 @@ -2147,20 +2143,18 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 +; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 @@ -2215,20 +2209,18 @@ define void @memmove_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 +; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 @@ -2283,20 +2275,18 @@ define void @memmove_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 +; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 @@ -3266,21 +3256,20 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write_b32 v0, v9 offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v10 offset:28 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -3339,21 +3328,20 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write_b32 v0, v9 offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v10 offset:28 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -3485,22 +3473,21 @@ define void @memmove_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x8 -; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: ds_write2_b32 v0, v7, v8 offset0:5 offset1:6 -; CHECK-NEXT: ds_write_b32 v0, v6 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 +; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6 +; CHECK-NEXT: ds_write_b32 v0, v7 offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: ds_write_b16 v0, v10 offset:28 +; CHECK-NEXT: ds_write_b8 v0, v6 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 0f47a31f52dcb..b5e7589cbd134 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2689,45 +2689,45 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x7c +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x4c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s12, s11 -; GFX9-NEXT: s_mul_hi_u32 s5, s12, s10 -; GFX9-NEXT: s_mul_i32 s6, s14, s9 -; GFX9-NEXT: s_mul_hi_u32 s7, s14, s8 +; GFX9-NEXT: s_mul_i32 s4, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s14 +; GFX9-NEXT: s_mul_i32 s6, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s7, s10, s12 ; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s5, s13, s10 +; GFX9-NEXT: s_mul_i32 s5, s9, s14 ; GFX9-NEXT: s_add_i32 s6, s7, s6 -; GFX9-NEXT: s_mul_i32 s7, s15, s8 +; GFX9-NEXT: s_mul_i32 s7, s11, s12 ; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_i32 s5, s12, s10 +; GFX9-NEXT: s_mul_i32 s5, s8, s14 ; GFX9-NEXT: s_add_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_i32 s7, s14, s8 +; GFX9-NEXT: s_mul_i32 s7, s10, s12 ; GFX9-NEXT: s_add_u32 s7, s7, s5 ; GFX9-NEXT: s_addc_u32 s6, s6, s4 -; GFX9-NEXT: s_mul_i32 s14, s9, s12 -; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 +; GFX9-NEXT: s_mul_i32 s14, s13, s8 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s8 +; GFX9-NEXT: s_mul_hi_u32 s11, s13, s8 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s5, s8, s13 +; GFX9-NEXT: s_mul_i32 s5, s12, s9 ; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s10, s12, s9 ; GFX9-NEXT: s_add_u32 s5, s5, s14 ; GFX9-NEXT: s_addc_u32 s10, s10, 0 ; GFX9-NEXT: s_add_u32 s10, s11, s10 ; GFX9-NEXT: s_addc_u32 s11, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 -; GFX9-NEXT: s_mul_i32 s9, s9, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s13, s9 +; GFX9-NEXT: s_mul_i32 s9, s13, s9 ; GFX9-NEXT: s_add_u32 s9, s9, s10 ; GFX9-NEXT: s_addc_u32 s10, s14, s11 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_add_u32 s9, s9, s7 ; GFX9-NEXT: s_addc_u32 s10, s10, s6 -; GFX9-NEXT: s_mul_i32 s6, s8, s12 +; GFX9-NEXT: s_mul_i32 s6, s12, s8 ; GFX9-NEXT: s_mov_b32 s7, s4 ; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index cc9650b9a7309..1abd2e6b60f2f 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -355,15 +355,15 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; ; GFX8-LABEL: scalar_or_literal_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 ; GFX8-NEXT: s_movk_i32 s8, 0x3039 ; GFX8-NEXT: s_mov_b32 s9, 0xf237b -; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index b4e5fa088b533..a4ddfee115fa6 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -3656,14 +3656,14 @@ define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-LABEL: extract_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:6 -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:6 +; GFX9-NEXT: global_load_ushort v8, v[0:1], off ; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v8 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v1, v9, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v9, 16, v8 ; GFX9-NEXT: global_store_dword v[4:5], v1, off ; GFX9-NEXT: global_store_dword v[6:7], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index cc299789af8dd..69983faf2b154 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s @@ -116,3 +117,6 @@ body: | S_ENDPGM 0 ... +## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +# GCN: {{.*}} +# GCN-GCNTRACKER: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b1a618ae4d5c5..c4842c1f4f523 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1193,9 +1193,9 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 2 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff8000, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v10 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo @@ -1203,38 +1203,38 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dword v11, v[0:1], off +; GFX10-NEXT: global_load_dword v12, v[0:1], off offset:1024 +; GFX10-NEXT: global_load_dword v13, v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dword v14, v[2:3], off offset:1024 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x1000, v0 -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: global_load_dword v9, v[0:1], off -; GFX10-NEXT: global_load_dword v10, v[0:1], off offset:1024 -; GFX10-NEXT: global_load_dword v11, v[2:3], off offset:1024 -; GFX10-NEXT: global_load_dword v12, v[4:5], off offset:-2048 -; GFX10-NEXT: global_load_dword v13, v[4:5], off ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x1800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v14, v[6:7], off offset:1024 -; GFX10-NEXT: global_load_dword v15, v[2:3], off offset:1024 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, 0x2000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dword v15, v[4:5], off +; GFX10-NEXT: global_load_dword v16, v[6:7], off offset:1024 +; GFX10-NEXT: global_load_dword v17, v[2:3], off offset:1024 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dword v2, v[4:5], off offset:-2048 -; GFX10-NEXT: global_load_dword v3, v[4:5], off -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:1024 +; GFX10-NEXT: global_load_dword v2, v[8:9], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[8:9], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:1024 ; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_add_nc_u32_e32 v0, v10, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v12, v11 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_add3_u32 v0, v12, v0, v11 -; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_add3_u32 v0, v13, v0, v14 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_add3_u32 v0, v15, v0, v16 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add3_u32 v0, v2, v0, v15 +; GFX10-NEXT: v_add3_u32 v0, v2, v0, v17 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add3_u32 v0, v3, v0, v6 -; GFX10-NEXT: global_store_dword v8, v0, s[34:35] +; GFX10-NEXT: v_add3_u32 v0, v3, v0, v4 +; GFX10-NEXT: global_store_dword v10, v0, s[34:35] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: Address32: @@ -1375,19 +1375,19 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf000 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf800 -; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[3:4] -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] +; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v4 ; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v5 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v7 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc @@ -1429,14 +1429,14 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_movk_i32 s0, 0xf000 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 ; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v3, vcc @@ -1521,15 +1521,15 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[4:5], v[2:3], off -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:2048 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v7, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo @@ -1686,26 +1686,26 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v4 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v6 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x80000000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:-2048 -; GFX10-NEXT: global_load_dword v7, v[2:3], off -; GFX10-NEXT: global_load_dword v8, v[0:1], off offset:1024 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7ffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dword v7, v[0:1], off +; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:-2048 +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: global_load_dword v10, v[4:5], off offset:1024 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_nc_u32_e32 v0, v6, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add3_u32 v0, v8, v0, v7 -; GFX10-NEXT: global_store_dword v4, v0, s[34:35] +; GFX10-NEXT: v_add3_u32 v0, v10, v0, v9 +; GFX10-NEXT: global_store_dword v6, v0, s[34:35] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: p32Offset64: @@ -2160,25 +2160,25 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048 -; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX9-NEXT: s_movk_i32 s0, 0x2000 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:2048 ; GFX9-NEXT: s_movk_i32 s0, 0x1000 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off -; GFX9-NEXT: global_load_dwordx2 v[16:17], v[4:5], off +; GFX9-NEXT: global_load_dwordx2 v[16:17], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048 ; GFX9-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 2863fccac9fbc..51de691e0eccc 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -3214,11 +3214,11 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 ; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 @@ -3249,10 +3249,9 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 ; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 ; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 ; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc ; VI-NEXT: s_waitcnt vmcnt(10) diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 593cff712004a..a82a6a8a4c367 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -878,19 +878,19 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 ; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 +; SI-NEXT: v_lshl_b64 v[5:6], v[5:6], v2 +; SI-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 ; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index ce89b2a962eea..0b49b9c815da5 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -563,19 +563,19 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 ; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6 -; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4 +; SI-NEXT: v_ashr_i64 v[5:6], v[5:6], v2 +; SI-NEXT: v_ashr_i64 v[3:4], v[3:4], v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13 ; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -589,19 +589,19 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 ; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] -; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] +; VI-NEXT: v_ashrrev_i64 v[5:6], v2, v[5:6] +; VI-NEXT: v_ashrrev_i64 v[3:4], v0, v[3:4] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10] ; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8] -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index badb1f6fe9847..239de43baa457 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -266,19 +266,19 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 ; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], v2 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index dcf3643756cb2..2efa022efd70f 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -279,36 +279,37 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_lshr_b32 s4, s3, 8 -; GFX11-NEXT: s_lshr_b32 s3, s3, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s4 ; GFX11-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24 ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s5 -; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_lshr_b32 s3, s3, 24 ; GFX11-NEXT: s_lshr_b32 s1, s1, 24 ; GFX11-NEXT: s_lshr_b32 s7, s0, 8 ; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s6 +; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s5 +; GFX11-NEXT: v_mov_b32_e32 v10, s1 ; GFX11-NEXT: ds_store_b8 v0, v2 offset:8 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:10 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:12 -; GFX11-NEXT: ds_store_b8 v0, v4 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v4 offset:2 -; GFX11-NEXT: ds_store_b8 v0, v3 offset:4 -; GFX11-NEXT: ds_store_b8 v0, v5 offset:13 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:14 +; GFX11-NEXT: ds_store_b8 v0, v5 offset:13 ; GFX11-NEXT: ds_store_b8 v0, v6 offset:15 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v10, s7 -; GFX11-NEXT: v_mov_b32_e32 v11, s0 ; GFX11-NEXT: ds_store_b8 v0, v7 offset:9 ; GFX11-NEXT: ds_store_b8 v0, v8 offset:11 -; GFX11-NEXT: ds_store_b8 v0, v9 offset:5 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: ds_store_b8 v0, v4 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v4 offset:2 +; GFX11-NEXT: ds_store_b8 v0, v3 offset:4 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6 -; GFX11-NEXT: ds_store_b8 v0, v1 offset:7 -; GFX11-NEXT: ds_store_b8 v0, v10 offset:1 -; GFX11-NEXT: ds_store_b8 v0, v11 offset:3 +; GFX11-NEXT: ds_store_b8 v0, v9 offset:5 +; GFX11-NEXT: ds_store_b8 v0, v10 offset:7 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:1 +; GFX11-NEXT: ds_store_b8 v0, v2 offset:3 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 1 ret void @@ -420,17 +421,17 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_mov_b32_e32 v4, s2 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:14 -; GFX11-NEXT: ds_store_b16 v0, v2 -; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 -; GFX11-NEXT: ds_store_b16 v0, v4 offset:8 -; GFX11-NEXT: ds_store_b16 v0, v1 offset:12 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v4 offset:10 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v4, s3 +; GFX11-NEXT: ds_store_b16 v0, v1 +; GFX11-NEXT: ds_store_b16 v0, v2 offset:4 +; GFX11-NEXT: ds_store_b16 v0, v3 offset:8 +; GFX11-NEXT: ds_store_b16 v0, v4 offset:12 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v4 offset:14 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:10 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:6 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:2 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll index 70906d8474aa5..03a7ec4883ff8 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -239,25 +239,26 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: s_lshr_b32 s3, s2, 8 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshr_b32 s1, s1, 24 ; GFX11-NEXT: s_lshr_b32 s5, s0, 8 ; GFX11-NEXT: s_lshr_b32 s0, s0, 24 -; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s1 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s0 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 +; GFX11-NEXT: ds_store_b8 v0, v4 offset:9 +; GFX11-NEXT: ds_store_b8 v0, v5 offset:11 ; GFX11-NEXT: ds_store_b8 v0, v3 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:2 ; GFX11-NEXT: ds_store_b8 v0, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v0, v4 offset:9 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 -; GFX11-NEXT: ds_store_b8 v0, v5 offset:11 -; GFX11-NEXT: ds_store_b8 v0, v6 offset:5 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:6 +; GFX11-NEXT: ds_store_b8 v0, v6 offset:5 ; GFX11-NEXT: ds_store_b8 v0, v7 offset:7 ; GFX11-NEXT: ds_store_b8 v0, v8 offset:1 ; GFX11-NEXT: ds_store_b8 v0, v9 offset:3 @@ -356,14 +357,14 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 -; GFX11-NEXT: ds_store_b16 v0, v2 -; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 -; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: ds_store_b16 v0, v1 +; GFX11-NEXT: ds_store_b16 v0, v2 offset:4 +; GFX11-NEXT: ds_store_b16 v0, v3 offset:8 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:10 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:6 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:2 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index a3aeea8a145cd..ec065b4daa376 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -967,20 +967,20 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v4i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:16 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:16 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v10, v14 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v11, v15, vcc @@ -993,22 +993,22 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_load_b128 v[0:3], v12, s[2:3] -; GFX12-NEXT: global_load_b128 v[4:7], v12, s[4:5] +; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] +; GFX12-NEXT: global_load_b128 v[4:7], v12, s[2:3] ; GFX12-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:16 -; GFX12-NEXT: global_load_b128 v[12:15], v12, s[4:5] offset:16 +; GFX12-NEXT: global_load_b128 v[12:15], v12, s[6:7] offset:16 ; GFX12-NEXT: s_wait_loadcnt 0x2 -; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v7, vcc_lo +; GFX12-NEXT: v_sub_co_ci_u32_e64 v3, null, v7, v3, vcc_lo ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_co_u32 v10, vcc_lo, v10, v14 ; GFX12-NEXT: s_wait_alu 0xfffd @@ -1016,9 +1016,9 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_sub_co_u32 v8, vcc_lo, v8, v12 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_sub_co_ci_u32_e64 v9, null, v9, v13, vcc_lo -; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v5, v1, vcc_lo ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index a56346f3bb45b..74e536f813716 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -37,22 +37,23 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; GFX6-LABEL: test_udivrem: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s8, s[4:5], 0x26 -; GFX6-NEXT: s_load_dword s9, s[4:5], 0x1d ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: s_sub_i32 s0, 0, s8 -; GFX6-NEXT: s_mov_b32 s7, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dword s9, s[4:5], 0x1d ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 ; GFX6-NEXT: s_mul_i32 s10, s10, s8 @@ -69,7 +70,6 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_cselect_b32 s8, s10, s9 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s8 @@ -79,7 +79,6 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; GFX8-LABEL: test_udivrem: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s6, s[4:5], 0x98 -; GFX8-NEXT: s_load_dword s7, s[4:5], 0x74 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX8-NEXT: s_sub_i32 s0, 0, s6 @@ -87,6 +86,7 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: s_load_dword s7, s[4:5], 0x74 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll index 5278c951ecddf..69519c00f88ea 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1481,38 +1481,38 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_16bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 4(a0) -; RV32I-NEXT: lbu a7, 5(a0) -; RV32I-NEXT: lbu t0, 6(a0) -; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu t0, 0(a1) ; RV32I-NEXT: lbu t1, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a7, a7, a6 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or t2, a6, t0 ; RV32I-NEXT: li a6, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: li t1, 32 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t2, t0, 16 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: or t0, a5, a3 -; RV32I-NEXT: or a4, t2, a4 -; RV32I-NEXT: or a5, a1, a7 +; RV32I-NEXT: or t0, a4, a3 +; RV32I-NEXT: or a4, a7, a5 +; RV32I-NEXT: or a5, a1, t2 ; RV32I-NEXT: slli a5, a5, 3 ; RV32I-NEXT: neg t3, a5 ; RV32I-NEXT: srl t4, t0, t3 @@ -1810,38 +1810,38 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; ; RV32I-LABEL: shl_16bytes_wordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 4(a0) -; RV32I-NEXT: lbu a7, 5(a0) -; RV32I-NEXT: lbu t0, 6(a0) -; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu t0, 5(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu t0, 0(a1) ; RV32I-NEXT: lbu t1, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a7, a7, a6 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or t2, a6, t0 ; RV32I-NEXT: li a6, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: li t1, 32 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t2, t0, 16 +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: or t0, a5, a3 -; RV32I-NEXT: or a4, t2, a4 -; RV32I-NEXT: or a5, a1, a7 +; RV32I-NEXT: or t0, a4, a3 +; RV32I-NEXT: or a4, a7, a5 +; RV32I-NEXT: or a5, a1, t2 ; RV32I-NEXT: slli a5, a5, 5 ; RV32I-NEXT: neg t3, a5 ; RV32I-NEXT: srl t4, t0, t3 @@ -5781,26 +5781,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: or t0, t3, t2 ; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu t2, 0(a1) ; RV32I-NEXT: lbu t3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) ; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: li s9, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t3 ; RV32I-NEXT: li t4, 32 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or t3, a5, a4 -; RV32I-NEXT: or a5, t2, a7 -; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, a1, t1 ; RV32I-NEXT: slli a4, a4, 3 ; RV32I-NEXT: neg s10, a4 ; RV32I-NEXT: srl t5, t3, s10 @@ -6695,26 +6695,26 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: or t0, t3, t2 ; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu t2, 0(a1) ; RV32I-NEXT: lbu t3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) ; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: li s9, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t3 ; RV32I-NEXT: li t4, 32 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or t3, a5, a4 -; RV32I-NEXT: or a5, t2, a7 -; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, a1, t1 ; RV32I-NEXT: slli a4, a4, 5 ; RV32I-NEXT: neg s10, a4 ; RV32I-NEXT: srl t5, t3, s10 @@ -7609,26 +7609,26 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: or t0, t3, t2 ; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu t2, 0(a1) ; RV32I-NEXT: lbu t3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) ; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: li s9, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t3 ; RV32I-NEXT: li t4, 32 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or t3, a5, a4 -; RV32I-NEXT: or a5, t2, a7 -; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, a1, t1 ; RV32I-NEXT: slli a4, a4, 6 ; RV32I-NEXT: neg s10, a4 ; RV32I-NEXT: srl t5, t3, s10 diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll index c9a48acb8d14a..3fb0f2c53bdf0 100644 --- a/llvm/test/CodeGen/RISCV/abds-neg.ll +++ b/llvm/test/CodeGen/RISCV/abds-neg.ll @@ -625,9 +625,9 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) ; RV32I-NEXT: lw t0, 8(a2) ; RV32I-NEXT: lw t2, 12(a2) -; RV32I-NEXT: lw a1, 0(a2) ; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t3, t0, a6 ; RV32I-NEXT: mv t4, t3 @@ -744,9 +744,9 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t1, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) ; RV32ZBB-NEXT: lw t0, 8(a2) ; RV32ZBB-NEXT: lw t2, 12(a2) -; RV32ZBB-NEXT: lw a1, 0(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t3, t0, a6 ; RV32ZBB-NEXT: mv t4, t3 @@ -872,9 +872,9 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) ; RV32I-NEXT: lw t0, 8(a2) ; RV32I-NEXT: lw t2, 12(a2) -; RV32I-NEXT: lw a1, 0(a2) ; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t3, t0, a6 ; RV32I-NEXT: mv t4, t3 @@ -991,9 +991,9 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t1, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) ; RV32ZBB-NEXT: lw t0, 8(a2) ; RV32ZBB-NEXT: lw t2, 12(a2) -; RV32ZBB-NEXT: lw a1, 0(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t3, t0, a6 ; RV32ZBB-NEXT: mv t4, t3 @@ -1385,8 +1385,8 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a6, 4(a2) ; RV32I-NEXT: lw a7, 8(a2) ; RV32I-NEXT: lw t0, 12(a2) -; RV32I-NEXT: lw a5, 12(a1) ; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a5, 12(a1) ; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: beq a5, t0, .LBB17_2 ; RV32I-NEXT: # %bb.1: @@ -1512,8 +1512,8 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a6, 4(a2) ; RV32ZBB-NEXT: lw a7, 8(a2) ; RV32ZBB-NEXT: lw t0, 12(a2) -; RV32ZBB-NEXT: lw a5, 12(a1) ; RV32ZBB-NEXT: lw a3, 4(a1) +; RV32ZBB-NEXT: lw a5, 12(a1) ; RV32ZBB-NEXT: lw a4, 8(a1) ; RV32ZBB-NEXT: beq a5, t0, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: @@ -1864,15 +1864,15 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a2) ; RV32I-NEXT: lw a5, 8(a2) ; RV32I-NEXT: lw a7, 12(a2) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) ; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw t1, 12(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a6, a5 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a7, .LBB22_2 +; RV32I-NEXT: sltu t0, a6, a5 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beq t1, a7, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t0, a7 +; RV32I-NEXT: slt t4, t1, a7 ; RV32I-NEXT: .LBB22_2: ; RV32I-NEXT: sltu t2, a2, a3 ; RV32I-NEXT: mv t3, t2 @@ -1880,7 +1880,7 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: sltu t3, a1, a4 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: xor t5, t0, a7 +; RV32I-NEXT: xor t5, t1, a7 ; RV32I-NEXT: xor t6, a6, a5 ; RV32I-NEXT: or t5, t6, t5 ; RV32I-NEXT: mv t6, t3 @@ -1896,11 +1896,11 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB22_8: ; RV32I-NEXT: bnez t6, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t1, a5, a6 -; RV32I-NEXT: sub a7, a7, t0 +; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: sub a7, a7, t1 ; RV32I-NEXT: sub a5, a5, a6 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a6, a7, t1 +; RV32I-NEXT: sub a6, a7, t0 ; RV32I-NEXT: sltu a7, a5, t5 ; RV32I-NEXT: sub a1, a5, t5 ; RV32I-NEXT: sub a5, a4, t4 @@ -1908,10 +1908,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sub a2, a3, a2 ; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sub a7, t0, a7 +; RV32I-NEXT: sub a7, t1, a7 ; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sub a4, a1, a4 -; RV32I-NEXT: sub a6, a7, t1 +; RV32I-NEXT: sub a6, a7, t0 ; RV32I-NEXT: sltu a7, a5, t3 ; RV32I-NEXT: sub a1, a5, t3 ; RV32I-NEXT: sub a5, a4, t2 @@ -1951,15 +1951,15 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a2) ; RV32ZBB-NEXT: lw a5, 8(a2) ; RV32ZBB-NEXT: lw a7, 12(a2) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) ; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a6, 8(a1) +; RV32ZBB-NEXT: lw t1, 12(a1) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a6, a5 -; RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a7, .LBB22_2 +; RV32ZBB-NEXT: sltu t0, a6, a5 +; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: beq t1, a7, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t0, a7 +; RV32ZBB-NEXT: slt t4, t1, a7 ; RV32ZBB-NEXT: .LBB22_2: ; RV32ZBB-NEXT: sltu t2, a2, a3 ; RV32ZBB-NEXT: mv t3, t2 @@ -1967,7 +1967,7 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: sltu t3, a1, a4 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: xor t5, t0, a7 +; RV32ZBB-NEXT: xor t5, t1, a7 ; RV32ZBB-NEXT: xor t6, a6, a5 ; RV32ZBB-NEXT: or t5, t6, t5 ; RV32ZBB-NEXT: mv t6, t3 @@ -1983,11 +1983,11 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB22_8: ; RV32ZBB-NEXT: bnez t6, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t1, a5, a6 -; RV32ZBB-NEXT: sub a7, a7, t0 +; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: sub a7, a7, t1 ; RV32ZBB-NEXT: sub a5, a5, a6 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a6, a7, t1 +; RV32ZBB-NEXT: sub a6, a7, t0 ; RV32ZBB-NEXT: sltu a7, a5, t5 ; RV32ZBB-NEXT: sub a1, a5, t5 ; RV32ZBB-NEXT: sub a5, a4, t4 @@ -1995,10 +1995,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: sub a2, a3, a2 ; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sub a7, t0, a7 +; RV32ZBB-NEXT: sub a7, t1, a7 ; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sub a4, a1, a4 -; RV32ZBB-NEXT: sub a6, a7, t1 +; RV32ZBB-NEXT: sub a6, a7, t0 ; RV32ZBB-NEXT: sltu a7, a5, t3 ; RV32ZBB-NEXT: sub a1, a5, t3 ; RV32ZBB-NEXT: sub a5, a4, t2 diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index 56e6dacff9748..efb4e1a6f15d6 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -536,73 +536,73 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: lw a6, 8(a2) ; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a1, 4(a2) -; RV32I-NEXT: sltu a2, a7, a6 -; RV32I-NEXT: mv t4, a2 -; RV32I-NEXT: beq t0, t1, .LBB11_2 +; RV32I-NEXT: lw a2, 4(a2) +; RV32I-NEXT: sltu t0, a6, a5 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beq a7, t1, .LBB11_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, t0 +; RV32I-NEXT: slt t4, t1, a7 ; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: sltu t2, a5, a3 -; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a1, a3 +; RV32I-NEXT: sltu t5, a2, a4 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a4, a1, .LBB11_4 +; RV32I-NEXT: beq a4, a2, .LBB11_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB11_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t6, t0, t1 -; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: xor t6, a7, t1 +; RV32I-NEXT: xor s0, a5, a6 ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB11_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv t3, t4 ; RV32I-NEXT: .LBB11_6: ; RV32I-NEXT: mv t4, t2 -; RV32I-NEXT: beq a1, a4, .LBB11_8 +; RV32I-NEXT: beq a2, a4, .LBB11_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB11_8: -; RV32I-NEXT: sltu t5, a3, a5 +; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 -; RV32I-NEXT: beq a4, a1, .LBB11_10 +; RV32I-NEXT: beq a4, a2, .LBB11_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB11_10: ; RV32I-NEXT: bnez t3, .LBB11_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sub t0, t1, t0 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a4, t0, a2 -; RV32I-NEXT: sltu a5, a6, t4 -; RV32I-NEXT: sub a2, a1, t2 -; RV32I-NEXT: sub a1, a4, a5 -; RV32I-NEXT: sub a4, a6, t4 +; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a2, a2, a4 +; RV32I-NEXT: sub a4, a7, t0 +; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sub a3, a2, t2 +; RV32I-NEXT: sub a2, a4, a6 +; RV32I-NEXT: sub a4, a5, t4 ; RV32I-NEXT: j .LBB11_13 ; RV32I-NEXT: .LBB11_12: -; RV32I-NEXT: sltu a2, a6, a7 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sub a3, a3, a5 -; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, t0, a2 -; RV32I-NEXT: sltu a5, a6, t6 -; RV32I-NEXT: sub a2, a4, t5 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a4, a6, t6 +; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: sub a5, a5, a6 +; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: sub a2, a7, t0 +; RV32I-NEXT: sltu a6, a5, t6 +; RV32I-NEXT: sub a3, a4, t5 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a4, a5, t6 ; RV32I-NEXT: .LBB11_13: -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -632,73 +632,73 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a5, 8(a1) +; RV32ZBB-NEXT: lw a7, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) ; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a1, 4(a2) -; RV32ZBB-NEXT: sltu a2, a7, a6 -; RV32ZBB-NEXT: mv t4, a2 -; RV32ZBB-NEXT: beq t0, t1, .LBB11_2 +; RV32ZBB-NEXT: lw a2, 4(a2) +; RV32ZBB-NEXT: sltu t0, a6, a5 +; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: beq a7, t1, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: slt t4, t1, a7 ; RV32ZBB-NEXT: .LBB11_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 -; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a1, a3 +; RV32ZBB-NEXT: sltu t5, a2, a4 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a4, a1, .LBB11_4 +; RV32ZBB-NEXT: beq a4, a2, .LBB11_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB11_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t6, t0, t1 -; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: xor t6, a7, t1 +; RV32ZBB-NEXT: xor s0, a5, a6 ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB11_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv t3, t4 ; RV32ZBB-NEXT: .LBB11_6: ; RV32ZBB-NEXT: mv t4, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB11_8 +; RV32ZBB-NEXT: beq a2, a4, .LBB11_8 ; RV32ZBB-NEXT: # %bb.7: ; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB11_8: -; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 -; RV32ZBB-NEXT: beq a4, a1, .LBB11_10 +; RV32ZBB-NEXT: beq a4, a2, .LBB11_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB11_10: ; RV32ZBB-NEXT: bnez t3, .LBB11_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sub t0, t1, t0 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a3, a5, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a4, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t4 -; RV32ZBB-NEXT: sub a2, a1, t2 -; RV32ZBB-NEXT: sub a1, a4, a5 -; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a2, a2, a4 +; RV32ZBB-NEXT: sub a4, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sub a3, a2, t2 +; RV32ZBB-NEXT: sub a2, a4, a6 +; RV32ZBB-NEXT: sub a4, a5, t4 ; RV32ZBB-NEXT: j .LBB11_13 ; RV32ZBB-NEXT: .LBB11_12: -; RV32ZBB-NEXT: sltu a2, a6, a7 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sub a3, a3, a5 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t6 -; RV32ZBB-NEXT: sub a2, a4, t5 -; RV32ZBB-NEXT: sub a1, a1, a5 -; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: sub a5, a5, a6 +; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sub a4, a4, a2 +; RV32ZBB-NEXT: sub a2, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t6 +; RV32ZBB-NEXT: sub a3, a4, t5 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a4, a5, t6 ; RV32ZBB-NEXT: .LBB11_13: -; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a3, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a1, 12(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -736,73 +736,73 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: lw a6, 8(a2) ; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a1, 4(a2) -; RV32I-NEXT: sltu a2, a7, a6 -; RV32I-NEXT: mv t4, a2 -; RV32I-NEXT: beq t0, t1, .LBB12_2 +; RV32I-NEXT: lw a2, 4(a2) +; RV32I-NEXT: sltu t0, a6, a5 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beq a7, t1, .LBB12_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, t0 +; RV32I-NEXT: slt t4, t1, a7 ; RV32I-NEXT: .LBB12_2: -; RV32I-NEXT: sltu t2, a5, a3 -; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a1, a3 +; RV32I-NEXT: sltu t5, a2, a4 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a4, a1, .LBB12_4 +; RV32I-NEXT: beq a4, a2, .LBB12_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB12_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t6, t0, t1 -; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: xor t6, a7, t1 +; RV32I-NEXT: xor s0, a5, a6 ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB12_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv t3, t4 ; RV32I-NEXT: .LBB12_6: ; RV32I-NEXT: mv t4, t2 -; RV32I-NEXT: beq a1, a4, .LBB12_8 +; RV32I-NEXT: beq a2, a4, .LBB12_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB12_8: -; RV32I-NEXT: sltu t5, a3, a5 +; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 -; RV32I-NEXT: beq a4, a1, .LBB12_10 +; RV32I-NEXT: beq a4, a2, .LBB12_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB12_10: ; RV32I-NEXT: bnez t3, .LBB12_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sub t0, t1, t0 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a4, t0, a2 -; RV32I-NEXT: sltu a5, a6, t4 -; RV32I-NEXT: sub a2, a1, t2 -; RV32I-NEXT: sub a1, a4, a5 -; RV32I-NEXT: sub a4, a6, t4 +; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a2, a2, a4 +; RV32I-NEXT: sub a4, a7, t0 +; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sub a3, a2, t2 +; RV32I-NEXT: sub a2, a4, a6 +; RV32I-NEXT: sub a4, a5, t4 ; RV32I-NEXT: j .LBB12_13 ; RV32I-NEXT: .LBB12_12: -; RV32I-NEXT: sltu a2, a6, a7 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sub a3, a3, a5 -; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, t0, a2 -; RV32I-NEXT: sltu a5, a6, t6 -; RV32I-NEXT: sub a2, a4, t5 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a4, a6, t6 +; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: sub a5, a5, a6 +; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: sub a2, a7, t0 +; RV32I-NEXT: sltu a6, a5, t6 +; RV32I-NEXT: sub a3, a4, t5 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a4, a5, t6 ; RV32I-NEXT: .LBB12_13: -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -832,73 +832,73 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a5, 8(a1) +; RV32ZBB-NEXT: lw a7, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) ; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a1, 4(a2) -; RV32ZBB-NEXT: sltu a2, a7, a6 -; RV32ZBB-NEXT: mv t4, a2 -; RV32ZBB-NEXT: beq t0, t1, .LBB12_2 +; RV32ZBB-NEXT: lw a2, 4(a2) +; RV32ZBB-NEXT: sltu t0, a6, a5 +; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: beq a7, t1, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: slt t4, t1, a7 ; RV32ZBB-NEXT: .LBB12_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 -; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a1, a3 +; RV32ZBB-NEXT: sltu t5, a2, a4 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a4, a1, .LBB12_4 +; RV32ZBB-NEXT: beq a4, a2, .LBB12_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB12_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t6, t0, t1 -; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: xor t6, a7, t1 +; RV32ZBB-NEXT: xor s0, a5, a6 ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB12_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv t3, t4 ; RV32ZBB-NEXT: .LBB12_6: ; RV32ZBB-NEXT: mv t4, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB12_8 +; RV32ZBB-NEXT: beq a2, a4, .LBB12_8 ; RV32ZBB-NEXT: # %bb.7: ; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB12_8: -; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 -; RV32ZBB-NEXT: beq a4, a1, .LBB12_10 +; RV32ZBB-NEXT: beq a4, a2, .LBB12_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB12_10: ; RV32ZBB-NEXT: bnez t3, .LBB12_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sub t0, t1, t0 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a3, a5, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a4, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t4 -; RV32ZBB-NEXT: sub a2, a1, t2 -; RV32ZBB-NEXT: sub a1, a4, a5 -; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a2, a2, a4 +; RV32ZBB-NEXT: sub a4, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sub a3, a2, t2 +; RV32ZBB-NEXT: sub a2, a4, a6 +; RV32ZBB-NEXT: sub a4, a5, t4 ; RV32ZBB-NEXT: j .LBB12_13 ; RV32ZBB-NEXT: .LBB12_12: -; RV32ZBB-NEXT: sltu a2, a6, a7 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sub a3, a3, a5 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t6 -; RV32ZBB-NEXT: sub a2, a4, t5 -; RV32ZBB-NEXT: sub a1, a1, a5 -; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: sub a5, a5, a6 +; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sub a4, a4, a2 +; RV32ZBB-NEXT: sub a2, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t6 +; RV32ZBB-NEXT: sub a3, a4, t5 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a4, a5, t6 ; RV32ZBB-NEXT: .LBB12_13: -; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a3, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a1, 12(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -1125,73 +1125,73 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: lw a6, 8(a2) ; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a1, 4(a2) -; RV32I-NEXT: sltu a2, a7, a6 -; RV32I-NEXT: mv t4, a2 -; RV32I-NEXT: beq t0, t1, .LBB17_2 +; RV32I-NEXT: lw a2, 4(a2) +; RV32I-NEXT: sltu t0, a6, a5 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beq a7, t1, .LBB17_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, t0 +; RV32I-NEXT: slt t4, t1, a7 ; RV32I-NEXT: .LBB17_2: -; RV32I-NEXT: sltu t2, a5, a3 -; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a1, a3 +; RV32I-NEXT: sltu t5, a2, a4 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a4, a1, .LBB17_4 +; RV32I-NEXT: beq a4, a2, .LBB17_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB17_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t6, t0, t1 -; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: xor t6, a7, t1 +; RV32I-NEXT: xor s0, a5, a6 ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB17_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv t3, t4 ; RV32I-NEXT: .LBB17_6: ; RV32I-NEXT: mv t4, t2 -; RV32I-NEXT: beq a1, a4, .LBB17_8 +; RV32I-NEXT: beq a2, a4, .LBB17_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB17_8: -; RV32I-NEXT: sltu t5, a3, a5 +; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 -; RV32I-NEXT: beq a4, a1, .LBB17_10 +; RV32I-NEXT: beq a4, a2, .LBB17_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB17_10: ; RV32I-NEXT: bnez t3, .LBB17_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sub t0, t1, t0 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a4, t0, a2 -; RV32I-NEXT: sltu a5, a6, t4 -; RV32I-NEXT: sub a2, a1, t2 -; RV32I-NEXT: sub a1, a4, a5 -; RV32I-NEXT: sub a4, a6, t4 +; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a2, a2, a4 +; RV32I-NEXT: sub a4, a7, t0 +; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sub a3, a2, t2 +; RV32I-NEXT: sub a2, a4, a6 +; RV32I-NEXT: sub a4, a5, t4 ; RV32I-NEXT: j .LBB17_13 ; RV32I-NEXT: .LBB17_12: -; RV32I-NEXT: sltu a2, a6, a7 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sub a3, a3, a5 -; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, t0, a2 -; RV32I-NEXT: sltu a5, a6, t6 -; RV32I-NEXT: sub a2, a4, t5 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a4, a6, t6 +; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: sub a5, a5, a6 +; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: sub a2, a7, t0 +; RV32I-NEXT: sltu a6, a5, t6 +; RV32I-NEXT: sub a3, a4, t5 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a4, a5, t6 ; RV32I-NEXT: .LBB17_13: -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1221,73 +1221,73 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a5, 8(a1) +; RV32ZBB-NEXT: lw a7, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) ; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a1, 4(a2) -; RV32ZBB-NEXT: sltu a2, a7, a6 -; RV32ZBB-NEXT: mv t4, a2 -; RV32ZBB-NEXT: beq t0, t1, .LBB17_2 +; RV32ZBB-NEXT: lw a2, 4(a2) +; RV32ZBB-NEXT: sltu t0, a6, a5 +; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: beq a7, t1, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: slt t4, t1, a7 ; RV32ZBB-NEXT: .LBB17_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 -; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a1, a3 +; RV32ZBB-NEXT: sltu t5, a2, a4 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a4, a1, .LBB17_4 +; RV32ZBB-NEXT: beq a4, a2, .LBB17_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB17_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t6, t0, t1 -; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: xor t6, a7, t1 +; RV32ZBB-NEXT: xor s0, a5, a6 ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB17_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv t3, t4 ; RV32ZBB-NEXT: .LBB17_6: ; RV32ZBB-NEXT: mv t4, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB17_8 +; RV32ZBB-NEXT: beq a2, a4, .LBB17_8 ; RV32ZBB-NEXT: # %bb.7: ; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB17_8: -; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 -; RV32ZBB-NEXT: beq a4, a1, .LBB17_10 +; RV32ZBB-NEXT: beq a4, a2, .LBB17_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB17_10: ; RV32ZBB-NEXT: bnez t3, .LBB17_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sub t0, t1, t0 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a3, a5, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a4, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t4 -; RV32ZBB-NEXT: sub a2, a1, t2 -; RV32ZBB-NEXT: sub a1, a4, a5 -; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a2, a2, a4 +; RV32ZBB-NEXT: sub a4, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sub a3, a2, t2 +; RV32ZBB-NEXT: sub a2, a4, a6 +; RV32ZBB-NEXT: sub a4, a5, t4 ; RV32ZBB-NEXT: j .LBB17_13 ; RV32ZBB-NEXT: .LBB17_12: -; RV32ZBB-NEXT: sltu a2, a6, a7 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sub a3, a3, a5 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t6 -; RV32ZBB-NEXT: sub a2, a4, t5 -; RV32ZBB-NEXT: sub a1, a1, a5 -; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: sub a5, a5, a6 +; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sub a4, a4, a2 +; RV32ZBB-NEXT: sub a2, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t6 +; RV32ZBB-NEXT: sub a3, a4, t5 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a4, a5, t6 ; RV32ZBB-NEXT: .LBB17_13: -; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a3, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a1, 12(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -1516,73 +1516,73 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: lw a6, 8(a2) ; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a1, 4(a2) -; RV32I-NEXT: sltu a2, a7, a6 -; RV32I-NEXT: mv t4, a2 -; RV32I-NEXT: beq t0, t1, .LBB22_2 +; RV32I-NEXT: lw a2, 4(a2) +; RV32I-NEXT: sltu t0, a6, a5 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beq a7, t1, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, t0 +; RV32I-NEXT: slt t4, t1, a7 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sltu t2, a5, a3 -; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a1, a3 +; RV32I-NEXT: sltu t5, a2, a4 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a4, a1, .LBB22_4 +; RV32I-NEXT: beq a4, a2, .LBB22_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB22_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t6, t0, t1 -; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: xor t6, a7, t1 +; RV32I-NEXT: xor s0, a5, a6 ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB22_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv t3, t4 ; RV32I-NEXT: .LBB22_6: ; RV32I-NEXT: mv t4, t2 -; RV32I-NEXT: beq a1, a4, .LBB22_8 +; RV32I-NEXT: beq a2, a4, .LBB22_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: sltu t5, a3, a5 +; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 -; RV32I-NEXT: beq a4, a1, .LBB22_10 +; RV32I-NEXT: beq a4, a2, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB22_10: ; RV32I-NEXT: bnez t3, .LBB22_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sub t0, t1, t0 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a4, t0, a2 -; RV32I-NEXT: sltu a5, a6, t4 -; RV32I-NEXT: sub a2, a1, t2 -; RV32I-NEXT: sub a1, a4, a5 -; RV32I-NEXT: sub a4, a6, t4 +; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a2, a2, a4 +; RV32I-NEXT: sub a4, a7, t0 +; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sub a3, a2, t2 +; RV32I-NEXT: sub a2, a4, a6 +; RV32I-NEXT: sub a4, a5, t4 ; RV32I-NEXT: j .LBB22_13 ; RV32I-NEXT: .LBB22_12: -; RV32I-NEXT: sltu a2, a6, a7 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sub a3, a3, a5 -; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, t0, a2 -; RV32I-NEXT: sltu a5, a6, t6 -; RV32I-NEXT: sub a2, a4, t5 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a4, a6, t6 +; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: sub a5, a5, a6 +; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: sub a2, a7, t0 +; RV32I-NEXT: sltu a6, a5, t6 +; RV32I-NEXT: sub a3, a4, t5 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a4, a5, t6 ; RV32I-NEXT: .LBB22_13: -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -1612,73 +1612,73 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a5, 8(a1) +; RV32ZBB-NEXT: lw a7, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) ; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a1, 4(a2) -; RV32ZBB-NEXT: sltu a2, a7, a6 -; RV32ZBB-NEXT: mv t4, a2 -; RV32ZBB-NEXT: beq t0, t1, .LBB22_2 +; RV32ZBB-NEXT: lw a2, 4(a2) +; RV32ZBB-NEXT: sltu t0, a6, a5 +; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: beq a7, t1, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: slt t4, t1, a7 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 -; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a1, a3 +; RV32ZBB-NEXT: sltu t5, a2, a4 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a4, a1, .LBB22_4 +; RV32ZBB-NEXT: beq a4, a2, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB22_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t6, t0, t1 -; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: xor t6, a7, t1 +; RV32ZBB-NEXT: xor s0, a5, a6 ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB22_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv t3, t4 ; RV32ZBB-NEXT: .LBB22_6: ; RV32ZBB-NEXT: mv t4, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB22_8 +; RV32ZBB-NEXT: beq a2, a4, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: ; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 -; RV32ZBB-NEXT: beq a4, a1, .LBB22_10 +; RV32ZBB-NEXT: beq a4, a2, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB22_10: ; RV32ZBB-NEXT: bnez t3, .LBB22_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sub t0, t1, t0 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a3, a5, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a4, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t4 -; RV32ZBB-NEXT: sub a2, a1, t2 -; RV32ZBB-NEXT: sub a1, a4, a5 -; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a2, a2, a4 +; RV32ZBB-NEXT: sub a4, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sub a3, a2, t2 +; RV32ZBB-NEXT: sub a2, a4, a6 +; RV32ZBB-NEXT: sub a4, a5, t4 ; RV32ZBB-NEXT: j .LBB22_13 ; RV32ZBB-NEXT: .LBB22_12: -; RV32ZBB-NEXT: sltu a2, a6, a7 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sub a3, a3, a5 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t6 -; RV32ZBB-NEXT: sub a2, a4, t5 -; RV32ZBB-NEXT: sub a1, a1, a5 -; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: sub a5, a5, a6 +; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sub a4, a4, a2 +; RV32ZBB-NEXT: sub a2, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t6 +; RV32ZBB-NEXT: sub a3, a4, t5 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a4, a5, t6 ; RV32ZBB-NEXT: .LBB22_13: -; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a3, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a1, 12(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret @@ -2539,73 +2539,73 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: lw a6, 8(a2) ; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a5, 0(a2) -; RV32I-NEXT: lw a1, 4(a2) -; RV32I-NEXT: sltu a2, a7, a6 -; RV32I-NEXT: mv t4, a2 -; RV32I-NEXT: beq t0, t1, .LBB38_2 +; RV32I-NEXT: lw a2, 4(a2) +; RV32I-NEXT: sltu t0, a6, a5 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beq a7, t1, .LBB38_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, t0 +; RV32I-NEXT: slt t4, t1, a7 ; RV32I-NEXT: .LBB38_2: -; RV32I-NEXT: sltu t2, a5, a3 -; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a1, a3 +; RV32I-NEXT: sltu t5, a2, a4 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a4, a1, .LBB38_4 +; RV32I-NEXT: beq a4, a2, .LBB38_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB38_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t6, t0, t1 -; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: xor t6, a7, t1 +; RV32I-NEXT: xor s0, a5, a6 ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB38_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv t3, t4 ; RV32I-NEXT: .LBB38_6: ; RV32I-NEXT: mv t4, t2 -; RV32I-NEXT: beq a1, a4, .LBB38_8 +; RV32I-NEXT: beq a2, a4, .LBB38_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB38_8: -; RV32I-NEXT: sltu t5, a3, a5 +; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 -; RV32I-NEXT: beq a4, a1, .LBB38_10 +; RV32I-NEXT: beq a4, a2, .LBB38_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB38_10: ; RV32I-NEXT: bnez t3, .LBB38_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sub t0, t1, t0 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a3, a5, a3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a4, t0, a2 -; RV32I-NEXT: sltu a5, a6, t4 -; RV32I-NEXT: sub a2, a1, t2 -; RV32I-NEXT: sub a1, a4, a5 -; RV32I-NEXT: sub a4, a6, t4 +; RV32I-NEXT: sub a7, t1, a7 +; RV32I-NEXT: sub a5, a6, a5 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a2, a2, a4 +; RV32I-NEXT: sub a4, a7, t0 +; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sub a3, a2, t2 +; RV32I-NEXT: sub a2, a4, a6 +; RV32I-NEXT: sub a4, a5, t4 ; RV32I-NEXT: j .LBB38_13 ; RV32I-NEXT: .LBB38_12: -; RV32I-NEXT: sltu a2, a6, a7 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sub a3, a3, a5 -; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a1, t0, a2 -; RV32I-NEXT: sltu a5, a6, t6 -; RV32I-NEXT: sub a2, a4, t5 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a4, a6, t6 +; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: sub a5, a5, a6 +; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: sub a2, a7, t0 +; RV32I-NEXT: sltu a6, a5, t6 +; RV32I-NEXT: sub a3, a4, t5 +; RV32I-NEXT: sub a2, a2, a6 +; RV32I-NEXT: sub a4, a5, t6 ; RV32I-NEXT: .LBB38_13: -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) ; RV32I-NEXT: sw a4, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -2635,73 +2635,73 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a1) ; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a5, 8(a1) +; RV32ZBB-NEXT: lw a7, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) ; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a5, 0(a2) -; RV32ZBB-NEXT: lw a1, 4(a2) -; RV32ZBB-NEXT: sltu a2, a7, a6 -; RV32ZBB-NEXT: mv t4, a2 -; RV32ZBB-NEXT: beq t0, t1, .LBB38_2 +; RV32ZBB-NEXT: lw a2, 4(a2) +; RV32ZBB-NEXT: sltu t0, a6, a5 +; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: beq a7, t1, .LBB38_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: slt t4, t1, a7 ; RV32ZBB-NEXT: .LBB38_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 -; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a1, a3 +; RV32ZBB-NEXT: sltu t5, a2, a4 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a4, a1, .LBB38_4 +; RV32ZBB-NEXT: beq a4, a2, .LBB38_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB38_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t6, t0, t1 -; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: xor t6, a7, t1 +; RV32ZBB-NEXT: xor s0, a5, a6 ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB38_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv t3, t4 ; RV32ZBB-NEXT: .LBB38_6: ; RV32ZBB-NEXT: mv t4, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB38_8 +; RV32ZBB-NEXT: beq a2, a4, .LBB38_8 ; RV32ZBB-NEXT: # %bb.7: ; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB38_8: -; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 -; RV32ZBB-NEXT: beq a4, a1, .LBB38_10 +; RV32ZBB-NEXT: beq a4, a2, .LBB38_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB38_10: ; RV32ZBB-NEXT: bnez t3, .LBB38_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sub t0, t1, t0 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a3, a5, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a4, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t4 -; RV32ZBB-NEXT: sub a2, a1, t2 -; RV32ZBB-NEXT: sub a1, a4, a5 -; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: sub a7, t1, a7 +; RV32ZBB-NEXT: sub a5, a6, a5 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a2, a2, a4 +; RV32ZBB-NEXT: sub a4, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sub a3, a2, t2 +; RV32ZBB-NEXT: sub a2, a4, a6 +; RV32ZBB-NEXT: sub a4, a5, t4 ; RV32ZBB-NEXT: j .LBB38_13 ; RV32ZBB-NEXT: .LBB38_12: -; RV32ZBB-NEXT: sltu a2, a6, a7 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sub a3, a3, a5 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a1, t0, a2 -; RV32ZBB-NEXT: sltu a5, a6, t6 -; RV32ZBB-NEXT: sub a2, a4, t5 -; RV32ZBB-NEXT: sub a1, a1, a5 -; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: sub a5, a5, a6 +; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sub a4, a4, a2 +; RV32ZBB-NEXT: sub a2, a7, t0 +; RV32ZBB-NEXT: sltu a6, a5, t6 +; RV32ZBB-NEXT: sub a3, a4, t5 +; RV32ZBB-NEXT: sub a2, a2, a6 +; RV32ZBB-NEXT: sub a4, a5, t6 ; RV32ZBB-NEXT: .LBB38_13: -; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a3, 4(a0) ; RV32ZBB-NEXT: sw a4, 8(a0) -; RV32ZBB-NEXT: sw a1, 12(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll index 911db598eb831..713b52f53e3d9 100644 --- a/llvm/test/CodeGen/RISCV/abdu-neg.ll +++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll @@ -1318,8 +1318,8 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a6, 4(a2) ; RV32I-NEXT: lw a7, 8(a2) ; RV32I-NEXT: lw t0, 12(a2) -; RV32I-NEXT: lw a5, 12(a1) ; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a5, 12(a1) ; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: beq a5, t0, .LBB17_2 ; RV32I-NEXT: # %bb.1: @@ -1445,8 +1445,8 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a6, 4(a2) ; RV32ZBB-NEXT: lw a7, 8(a2) ; RV32ZBB-NEXT: lw t0, 12(a2) -; RV32ZBB-NEXT: lw a5, 12(a1) ; RV32ZBB-NEXT: lw a3, 4(a1) +; RV32ZBB-NEXT: lw a5, 12(a1) ; RV32ZBB-NEXT: lw a4, 8(a1) ; RV32ZBB-NEXT: beq a5, t0, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: @@ -1767,15 +1767,15 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a2) ; RV32I-NEXT: lw a5, 8(a2) ; RV32I-NEXT: lw a7, 12(a2) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) ; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw t1, 12(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a6, a5 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a7, .LBB22_2 +; RV32I-NEXT: sltu t0, a6, a5 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: beq t1, a7, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t4, t0, a7 +; RV32I-NEXT: sltu t4, t1, a7 ; RV32I-NEXT: .LBB22_2: ; RV32I-NEXT: sltu t2, a2, a3 ; RV32I-NEXT: mv t3, t2 @@ -1783,7 +1783,7 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: sltu t3, a1, a4 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: xor t5, t0, a7 +; RV32I-NEXT: xor t5, t1, a7 ; RV32I-NEXT: xor t6, a6, a5 ; RV32I-NEXT: or t5, t6, t5 ; RV32I-NEXT: mv t6, t3 @@ -1799,11 +1799,11 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB22_8: ; RV32I-NEXT: bnez t6, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t1, a5, a6 -; RV32I-NEXT: sub a7, a7, t0 +; RV32I-NEXT: sltu t0, a5, a6 +; RV32I-NEXT: sub a7, a7, t1 ; RV32I-NEXT: sub a5, a5, a6 ; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a6, a7, t1 +; RV32I-NEXT: sub a6, a7, t0 ; RV32I-NEXT: sltu a7, a5, t5 ; RV32I-NEXT: sub a1, a5, t5 ; RV32I-NEXT: sub a5, a4, t4 @@ -1811,10 +1811,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sub a2, a3, a2 ; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sub a7, t0, a7 +; RV32I-NEXT: sub a7, t1, a7 ; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sub a4, a1, a4 -; RV32I-NEXT: sub a6, a7, t1 +; RV32I-NEXT: sub a6, a7, t0 ; RV32I-NEXT: sltu a7, a5, t3 ; RV32I-NEXT: sub a1, a5, t3 ; RV32I-NEXT: sub a5, a4, t2 @@ -1854,15 +1854,15 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a2) ; RV32ZBB-NEXT: lw a5, 8(a2) ; RV32ZBB-NEXT: lw a7, 12(a2) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) ; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a6, 8(a1) +; RV32ZBB-NEXT: lw t1, 12(a1) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a6, a5 -; RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a7, .LBB22_2 +; RV32ZBB-NEXT: sltu t0, a6, a5 +; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: beq t1, a7, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t4, t0, a7 +; RV32ZBB-NEXT: sltu t4, t1, a7 ; RV32ZBB-NEXT: .LBB22_2: ; RV32ZBB-NEXT: sltu t2, a2, a3 ; RV32ZBB-NEXT: mv t3, t2 @@ -1870,7 +1870,7 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: sltu t3, a1, a4 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: xor t5, t0, a7 +; RV32ZBB-NEXT: xor t5, t1, a7 ; RV32ZBB-NEXT: xor t6, a6, a5 ; RV32ZBB-NEXT: or t5, t6, t5 ; RV32ZBB-NEXT: mv t6, t3 @@ -1886,11 +1886,11 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB22_8: ; RV32ZBB-NEXT: bnez t6, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t1, a5, a6 -; RV32ZBB-NEXT: sub a7, a7, t0 +; RV32ZBB-NEXT: sltu t0, a5, a6 +; RV32ZBB-NEXT: sub a7, a7, t1 ; RV32ZBB-NEXT: sub a5, a5, a6 ; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a6, a7, t1 +; RV32ZBB-NEXT: sub a6, a7, t0 ; RV32ZBB-NEXT: sltu a7, a5, t5 ; RV32ZBB-NEXT: sub a1, a5, t5 ; RV32ZBB-NEXT: sub a5, a4, t4 @@ -1898,10 +1898,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: sub a2, a3, a2 ; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sub a7, t0, a7 +; RV32ZBB-NEXT: sub a7, t1, a7 ; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sub a4, a1, a4 -; RV32ZBB-NEXT: sub a6, a7, t1 +; RV32ZBB-NEXT: sub a6, a7, t0 ; RV32ZBB-NEXT: sltu a7, a5, t3 ; RV32ZBB-NEXT: sub a1, a5, t3 ; RV32ZBB-NEXT: sub a5, a4, t2 diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll index b6ff3c9060af5..35a39b89a2cb7 100644 --- a/llvm/test/CodeGen/RISCV/add-before-shl.ll +++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll @@ -200,26 +200,26 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; ; RV32C-LABEL: add_wide_operand: ; RV32C: # %bb.0: +; RV32C-NEXT: c.lw a2, 0(a1) ; RV32C-NEXT: c.lw a4, 12(a1) -; RV32C-NEXT: c.lw a3, 0(a1) -; RV32C-NEXT: c.lw a2, 4(a1) +; RV32C-NEXT: c.lw a3, 4(a1) ; RV32C-NEXT: c.lw a1, 8(a1) ; RV32C-NEXT: c.lui a5, 16 ; RV32C-NEXT: add a6, a4, a5 -; RV32C-NEXT: srli a5, a3, 29 -; RV32C-NEXT: slli a4, a2, 3 +; RV32C-NEXT: srli a5, a2, 29 +; RV32C-NEXT: slli a4, a3, 3 ; RV32C-NEXT: c.or a4, a5 ; RV32C-NEXT: srli a5, a1, 29 -; RV32C-NEXT: c.srli a2, 29 +; RV32C-NEXT: c.srli a3, 29 ; RV32C-NEXT: c.slli a1, 3 -; RV32C-NEXT: c.slli a3, 3 +; RV32C-NEXT: c.slli a2, 3 ; RV32C-NEXT: c.slli a6, 3 -; RV32C-NEXT: c.or a1, a2 -; RV32C-NEXT: or a2, a6, a5 -; RV32C-NEXT: c.sw a3, 0(a0) +; RV32C-NEXT: c.or a1, a3 +; RV32C-NEXT: or a3, a6, a5 +; RV32C-NEXT: c.sw a2, 0(a0) ; RV32C-NEXT: c.sw a4, 4(a0) ; RV32C-NEXT: c.sw a1, 8(a0) -; RV32C-NEXT: c.sw a2, 12(a0) +; RV32C-NEXT: c.sw a3, 12(a0) ; RV32C-NEXT: c.jr ra ; ; RV64C-LABEL: add_wide_operand: diff --git a/llvm/test/CodeGen/RISCV/fold-mem-offset.ll b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll index f4072ffa1e3df..e6f1a08cc1879 100644 --- a/llvm/test/CodeGen/RISCV/fold-mem-offset.ll +++ b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll @@ -213,12 +213,12 @@ define i64 @test_sh3add_uw(ptr %p, i32 signext %x, i32 signext %y) { ; RV32I-NEXT: slli a2, a2, 3 ; RV32I-NEXT: add a1, a0, a1 ; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: lw a2, 404(a0) -; RV32I-NEXT: lw a3, 400(a1) +; RV32I-NEXT: lw a2, 400(a1) ; RV32I-NEXT: lw a1, 404(a1) +; RV32I-NEXT: lw a3, 404(a0) ; RV32I-NEXT: lw a4, 400(a0) -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: add a0, a4, a3 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a0, a4, a2 ; RV32I-NEXT: sltu a2, a0, a4 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret @@ -240,12 +240,12 @@ define i64 @test_sh3add_uw(ptr %p, i32 signext %x, i32 signext %y) { ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sh3add a1, a1, a0 ; RV32ZBA-NEXT: sh3add a0, a2, a0 -; RV32ZBA-NEXT: lw a2, 404(a0) -; RV32ZBA-NEXT: lw a3, 400(a1) +; RV32ZBA-NEXT: lw a2, 400(a1) ; RV32ZBA-NEXT: lw a1, 404(a1) +; RV32ZBA-NEXT: lw a3, 404(a0) ; RV32ZBA-NEXT: lw a4, 400(a0) -; RV32ZBA-NEXT: add a1, a2, a1 -; RV32ZBA-NEXT: add a0, a4, a3 +; RV32ZBA-NEXT: add a1, a3, a1 +; RV32ZBA-NEXT: add a0, a4, a2 ; RV32ZBA-NEXT: sltu a2, a0, a4 ; RV32ZBA-NEXT: add a1, a1, a2 ; RV32ZBA-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll index f60b77b92c09e..9e66eb7a2ae6c 100644 --- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll +++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll @@ -56,16 +56,16 @@ entry: define void @test3(ptr %a, ptr %b) nounwind { ; RV32-LABEL: test3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lw a2, 12(a1) -; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a2, 0(a1) +; RV32-NEXT: lw a3, 12(a1) ; RV32-NEXT: lw a4, 4(a1) ; RV32-NEXT: lw a1, 8(a1) ; RV32-NEXT: lui a5, 524288 -; RV32-NEXT: xor a2, a2, a5 -; RV32-NEXT: sw a3, 0(a0) +; RV32-NEXT: xor a3, a3, a5 +; RV32-NEXT: sw a2, 0(a0) ; RV32-NEXT: sw a4, 4(a0) ; RV32-NEXT: sw a1, 8(a0) -; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: sw a3, 12(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: test3: diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index fc714e3faef43..38cd51c074594 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -4411,25 +4411,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV32-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV32-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-NEXT: ret @@ -4437,25 +4437,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV64-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV64-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-NEXT: ret @@ -4463,25 +4463,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV32-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBB-NEXT: ret @@ -4489,25 +4489,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV64-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBB-NEXT: ret @@ -4559,25 +4559,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-V-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-V-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-V-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV32-V-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-V-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV32-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-V-NEXT: ret @@ -4585,25 +4585,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-V-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-V-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-V-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV64-V-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-V-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV64-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-V-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index ddfbd649a43b8..df9d781a4536d 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -5981,25 +5981,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV32-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV32-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-NEXT: ret @@ -6007,25 +6007,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV64-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV64-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-NEXT: ret @@ -6033,25 +6033,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV32-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBB-NEXT: ret @@ -6059,25 +6059,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV64-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBB-NEXT: ret @@ -6129,25 +6129,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry ; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-V-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV32-V-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-V-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV32-V-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-V-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV32-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-V-NEXT: ret @@ -6155,25 +6155,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry ; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 2(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lb a4, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a1, 0(a1) ; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-V-NEXT: or a2, a2, a3 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 -; CHECK-ALIGNED-RV64-V-NEXT: slli a1, a1, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a0) +; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 24 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a2, a1 +; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 0(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 2(a0) ; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-V-NEXT: or a3, a5, a3 -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8 +; CHECK-ALIGNED-RV64-V-NEXT: or a2, a2, a4 +; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-V-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a3, a1 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a2 ; CHECK-ALIGNED-RV64-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-V-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 62bc7b3336a5c..8dd63015971d0 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -1590,8 +1590,8 @@ define i128 @sub_if_uge_i128(i128 %x, i128 %y) { ; CHECK-NEXT: lw a7, 4(a2) ; CHECK-NEXT: lw a6, 8(a2) ; CHECK-NEXT: lw t0, 12(a2) -; CHECK-NEXT: lw a4, 12(a1) ; CHECK-NEXT: lw a3, 4(a1) +; CHECK-NEXT: lw a4, 12(a1) ; CHECK-NEXT: lw a5, 8(a1) ; CHECK-NEXT: beq a4, t0, .LBB53_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll index e13f4f4b50b0f..651894a1bb661 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll @@ -26,26 +26,26 @@ define void @add_v4i32(ptr %x, ptr %y) { define void @add_v2i64(ptr %x, ptr %y) { ; RV32-LABEL: add_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: lw a2, 0(a1) -; RV32-NEXT: lw a3, 4(a1) -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: lw a6, 8(a0) -; RV32-NEXT: lw a7, 12(a0) +; RV32-NEXT: lw a2, 0(a0) +; RV32-NEXT: lw a3, 4(a0) +; RV32-NEXT: lw a4, 8(a0) +; RV32-NEXT: lw a5, 12(a0) +; RV32-NEXT: lw a6, 4(a1) +; RV32-NEXT: lw a7, 0(a1) ; RV32-NEXT: lw t0, 12(a1) ; RV32-NEXT: lw a1, 8(a1) -; RV32-NEXT: add a3, a5, a3 -; RV32-NEXT: add a2, a4, a2 -; RV32-NEXT: add a7, a7, t0 -; RV32-NEXT: add a1, a6, a1 -; RV32-NEXT: sltu a4, a2, a4 -; RV32-NEXT: sltu a5, a1, a6 -; RV32-NEXT: add a3, a3, a4 -; RV32-NEXT: add a5, a7, a5 -; RV32-NEXT: sw a2, 0(a0) -; RV32-NEXT: sw a3, 4(a0) +; RV32-NEXT: add a3, a3, a6 +; RV32-NEXT: add a7, a2, a7 +; RV32-NEXT: add a5, a5, t0 +; RV32-NEXT: add a1, a4, a1 +; RV32-NEXT: sltu a2, a7, a2 +; RV32-NEXT: sltu a4, a1, a4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: sw a7, 0(a0) +; RV32-NEXT: sw a2, 4(a0) ; RV32-NEXT: sw a1, 8(a0) -; RV32-NEXT: sw a5, 12(a0) +; RV32-NEXT: sw a4, 12(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: add_v2i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index a5eba788feeff..1fa96d3c07ca9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1406,28 +1406,28 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32VB-NEXT: slli a7, a7, 16 ; RV32VB-NEXT: slli t0, t0, 24 ; RV32VB-NEXT: slli a5, a5, 8 -; RV32VB-NEXT: or a7, t0, a7 -; RV32VB-NEXT: or a4, a4, a5 -; RV32VB-NEXT: lbu a5, 12(a0) -; RV32VB-NEXT: lbu t0, 13(a0) ; RV32VB-NEXT: slli a6, a6, 16 ; RV32VB-NEXT: slli t1, t1, 24 -; RV32VB-NEXT: or a6, t1, a6 +; RV32VB-NEXT: or a7, t0, a7 +; RV32VB-NEXT: or a4, a4, a5 +; RV32VB-NEXT: or a5, t1, a6 +; RV32VB-NEXT: lbu a6, 13(a0) +; RV32VB-NEXT: lbu t0, 12(a0) ; RV32VB-NEXT: lbu t1, 14(a0) ; RV32VB-NEXT: lbu a0, 15(a0) -; RV32VB-NEXT: slli t0, t0, 8 -; RV32VB-NEXT: or a5, a5, t0 +; RV32VB-NEXT: slli a6, a6, 8 +; RV32VB-NEXT: or a6, t0, a6 ; RV32VB-NEXT: slli t1, t1, 16 ; RV32VB-NEXT: slli a0, a0, 24 ; RV32VB-NEXT: or a0, a0, t1 ; RV32VB-NEXT: or a1, a1, a3 ; RV32VB-NEXT: or a2, a2, a7 -; RV32VB-NEXT: or a3, a4, a6 -; RV32VB-NEXT: or a0, a5, a0 +; RV32VB-NEXT: or a4, a4, a5 +; RV32VB-NEXT: or a0, a6, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a2 -; RV32VB-NEXT: vslide1down.vx v8, v8, a3 +; RV32VB-NEXT: vslide1down.vx v8, v8, a4 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: ret ; @@ -1767,36 +1767,36 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32VB-NEXT: lbu a7, 55(a0) ; RV32VB-NEXT: lbu t0, 75(a0) ; RV32VB-NEXT: lbu t1, 82(a0) -; RV32VB-NEXT: lbu t2, 154(a0) -; RV32VB-NEXT: lbu t3, 161(a0) ; RV32VB-NEXT: slli a2, a2, 8 ; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: slli a4, a4, 24 ; RV32VB-NEXT: slli a7, a7, 8 -; RV32VB-NEXT: slli a5, a5, 16 -; RV32VB-NEXT: slli t0, t0, 24 ; RV32VB-NEXT: or a1, a1, a2 ; RV32VB-NEXT: or a3, a4, a3 ; RV32VB-NEXT: or a2, a6, a7 -; RV32VB-NEXT: or a4, t0, a5 -; RV32VB-NEXT: lbu a5, 93(a0) +; RV32VB-NEXT: lbu a4, 93(a0) ; RV32VB-NEXT: lbu a6, 105(a0) ; RV32VB-NEXT: lbu a7, 124(a0) -; RV32VB-NEXT: lbu t0, 144(a0) -; RV32VB-NEXT: slli a5, a5, 8 -; RV32VB-NEXT: or a5, t1, a5 +; RV32VB-NEXT: lbu t2, 144(a0) +; RV32VB-NEXT: slli a5, a5, 16 +; RV32VB-NEXT: slli t0, t0, 24 +; RV32VB-NEXT: slli a4, a4, 8 +; RV32VB-NEXT: or a5, t0, a5 +; RV32VB-NEXT: or a4, t1, a4 +; RV32VB-NEXT: lbu t0, 161(a0) +; RV32VB-NEXT: lbu t1, 154(a0) ; RV32VB-NEXT: lbu a0, 163(a0) ; RV32VB-NEXT: slli a6, a6, 16 -; RV32VB-NEXT: slli t3, t3, 24 -; RV32VB-NEXT: or a6, t3, a6 +; RV32VB-NEXT: slli t0, t0, 24 +; RV32VB-NEXT: or a6, t0, a6 ; RV32VB-NEXT: slli a0, a0, 8 ; RV32VB-NEXT: or a0, a7, a0 -; RV32VB-NEXT: slli t0, t0, 16 -; RV32VB-NEXT: slli t2, t2, 24 -; RV32VB-NEXT: or a7, t2, t0 +; RV32VB-NEXT: slli t2, t2, 16 +; RV32VB-NEXT: slli t1, t1, 24 +; RV32VB-NEXT: or a7, t1, t2 ; RV32VB-NEXT: or a1, a1, a3 -; RV32VB-NEXT: or a2, a2, a4 -; RV32VB-NEXT: or a3, a5, a6 +; RV32VB-NEXT: or a2, a2, a5 +; RV32VB-NEXT: or a3, a4, a6 ; RV32VB-NEXT: or a0, a0, a7 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 @@ -1893,50 +1893,50 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_loads_gather: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a6, 0(a0) -; RVA22U64-NEXT: lbu a7, 1(a0) -; RVA22U64-NEXT: lbu t0, 22(a0) -; RVA22U64-NEXT: lbu t1, 31(a0) -; RVA22U64-NEXT: lbu t2, 623(a0) -; RVA22U64-NEXT: lbu a1, 44(a0) -; RVA22U64-NEXT: lbu a2, 55(a0) -; RVA22U64-NEXT: lbu a3, 75(a0) -; RVA22U64-NEXT: lbu t3, 82(a0) -; RVA22U64-NEXT: lbu t4, 154(a0) -; RVA22U64-NEXT: lbu t5, 161(a0) -; RVA22U64-NEXT: slli a7, a7, 8 -; RVA22U64-NEXT: slli t0, t0, 16 -; RVA22U64-NEXT: slli t1, t1, 24 -; RVA22U64-NEXT: slli a1, a1, 32 -; RVA22U64-NEXT: slli a2, a2, 40 -; RVA22U64-NEXT: slli t2, t2, 48 -; RVA22U64-NEXT: slli a3, a3, 56 -; RVA22U64-NEXT: or a6, a6, a7 -; RVA22U64-NEXT: or t0, t1, t0 -; RVA22U64-NEXT: or a7, a2, a1 -; RVA22U64-NEXT: or a2, a3, t2 -; RVA22U64-NEXT: lbu a3, 93(a0) -; RVA22U64-NEXT: lbu a5, 105(a0) -; RVA22U64-NEXT: lbu a1, 124(a0) -; RVA22U64-NEXT: lbu a4, 144(a0) -; RVA22U64-NEXT: slli a3, a3, 8 -; RVA22U64-NEXT: or a3, t3, a3 +; RVA22U64-NEXT: lbu a7, 0(a0) +; RVA22U64-NEXT: lbu a2, 1(a0) +; RVA22U64-NEXT: lbu a3, 22(a0) +; RVA22U64-NEXT: lbu a4, 31(a0) +; RVA22U64-NEXT: lbu a6, 623(a0) +; RVA22U64-NEXT: lbu a5, 44(a0) +; RVA22U64-NEXT: lbu a1, 55(a0) +; RVA22U64-NEXT: lbu t0, 75(a0) +; RVA22U64-NEXT: lbu t1, 82(a0) +; RVA22U64-NEXT: slli a2, a2, 8 +; RVA22U64-NEXT: slli a3, a3, 16 +; RVA22U64-NEXT: slli a4, a4, 24 +; RVA22U64-NEXT: slli a5, a5, 32 +; RVA22U64-NEXT: slli a1, a1, 40 +; RVA22U64-NEXT: or a7, a7, a2 +; RVA22U64-NEXT: or t3, a4, a3 +; RVA22U64-NEXT: or t2, a1, a5 +; RVA22U64-NEXT: lbu a4, 93(a0) +; RVA22U64-NEXT: lbu t4, 105(a0) +; RVA22U64-NEXT: lbu a2, 124(a0) +; RVA22U64-NEXT: lbu t5, 144(a0) +; RVA22U64-NEXT: slli a6, a6, 48 +; RVA22U64-NEXT: slli t0, t0, 56 +; RVA22U64-NEXT: slli a4, a4, 8 +; RVA22U64-NEXT: or a3, t0, a6 +; RVA22U64-NEXT: or a4, t1, a4 +; RVA22U64-NEXT: lbu a5, 161(a0) +; RVA22U64-NEXT: lbu a1, 154(a0) ; RVA22U64-NEXT: lbu a0, 163(a0) -; RVA22U64-NEXT: slli a5, a5, 16 -; RVA22U64-NEXT: slli t5, t5, 24 -; RVA22U64-NEXT: or a5, t5, a5 -; RVA22U64-NEXT: slli a1, a1, 32 +; RVA22U64-NEXT: slli t4, t4, 16 +; RVA22U64-NEXT: slli a5, a5, 24 +; RVA22U64-NEXT: or a5, a5, t4 +; RVA22U64-NEXT: slli a2, a2, 32 ; RVA22U64-NEXT: slli a0, a0, 40 +; RVA22U64-NEXT: or a0, a0, a2 +; RVA22U64-NEXT: slli t5, t5, 48 +; RVA22U64-NEXT: slli a1, a1, 56 +; RVA22U64-NEXT: or a1, a1, t5 +; RVA22U64-NEXT: or a2, a7, t3 +; RVA22U64-NEXT: or a3, a3, t2 +; RVA22U64-NEXT: or a4, a4, a5 ; RVA22U64-NEXT: or a0, a0, a1 -; RVA22U64-NEXT: slli a4, a4, 48 -; RVA22U64-NEXT: slli t4, t4, 56 -; RVA22U64-NEXT: or a1, t4, a4 -; RVA22U64-NEXT: or a4, a6, t0 -; RVA22U64-NEXT: or a2, a2, a7 -; RVA22U64-NEXT: or a3, a3, a5 -; RVA22U64-NEXT: or a0, a0, a1 -; RVA22U64-NEXT: or a2, a2, a4 -; RVA22U64-NEXT: or a0, a0, a3 +; RVA22U64-NEXT: or a2, a2, a3 +; RVA22U64-NEXT: or a0, a0, a4 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-NEXT: vmv.v.x v8, a2 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 @@ -2111,24 +2111,24 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RV32VB-LABEL: buildvec_v16i8_undef_low_half: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 82(a0) -; RV32VB-NEXT: lbu a2, 93(a0) -; RV32VB-NEXT: lbu a3, 144(a0) -; RV32VB-NEXT: lbu a4, 154(a0) -; RV32VB-NEXT: lbu a5, 161(a0) -; RV32VB-NEXT: lbu a6, 105(a0) -; RV32VB-NEXT: lbu a7, 124(a0) -; RV32VB-NEXT: slli a2, a2, 8 -; RV32VB-NEXT: or a1, a1, a2 +; RV32VB-NEXT: lbu a1, 93(a0) +; RV32VB-NEXT: lbu a2, 82(a0) +; RV32VB-NEXT: lbu a3, 105(a0) +; RV32VB-NEXT: lbu a4, 124(a0) +; RV32VB-NEXT: slli a1, a1, 8 +; RV32VB-NEXT: or a1, a2, a1 +; RV32VB-NEXT: lbu a2, 161(a0) +; RV32VB-NEXT: lbu a5, 144(a0) +; RV32VB-NEXT: lbu a6, 154(a0) ; RV32VB-NEXT: lbu a0, 163(a0) -; RV32VB-NEXT: slli a6, a6, 16 -; RV32VB-NEXT: slli a5, a5, 24 -; RV32VB-NEXT: or a2, a5, a6 -; RV32VB-NEXT: slli a0, a0, 8 -; RV32VB-NEXT: or a0, a7, a0 ; RV32VB-NEXT: slli a3, a3, 16 -; RV32VB-NEXT: slli a4, a4, 24 -; RV32VB-NEXT: or a3, a4, a3 +; RV32VB-NEXT: slli a2, a2, 24 +; RV32VB-NEXT: or a2, a2, a3 +; RV32VB-NEXT: slli a0, a0, 8 +; RV32VB-NEXT: or a0, a4, a0 +; RV32VB-NEXT: slli a5, a5, 16 +; RV32VB-NEXT: slli a6, a6, 24 +; RV32VB-NEXT: or a3, a6, a5 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.i v8, 0 ; RV32VB-NEXT: or a1, a1, a2 @@ -2186,28 +2186,28 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_undef_low_half: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a6, 82(a0) -; RVA22U64-NEXT: lbu a7, 93(a0) -; RVA22U64-NEXT: lbu t0, 144(a0) -; RVA22U64-NEXT: lbu a4, 154(a0) -; RVA22U64-NEXT: lbu a5, 161(a0) -; RVA22U64-NEXT: lbu a1, 105(a0) -; RVA22U64-NEXT: lbu a2, 124(a0) -; RVA22U64-NEXT: slli a7, a7, 8 -; RVA22U64-NEXT: or a3, a6, a7 +; RVA22U64-NEXT: lbu a1, 93(a0) +; RVA22U64-NEXT: lbu a2, 82(a0) +; RVA22U64-NEXT: lbu a3, 105(a0) +; RVA22U64-NEXT: lbu a4, 124(a0) +; RVA22U64-NEXT: slli a1, a1, 8 +; RVA22U64-NEXT: or a6, a2, a1 +; RVA22U64-NEXT: lbu a2, 161(a0) +; RVA22U64-NEXT: lbu a5, 144(a0) +; RVA22U64-NEXT: lbu a1, 154(a0) ; RVA22U64-NEXT: lbu a0, 163(a0) -; RVA22U64-NEXT: slli a1, a1, 16 -; RVA22U64-NEXT: slli a5, a5, 24 -; RVA22U64-NEXT: or a1, a1, a5 -; RVA22U64-NEXT: slli a2, a2, 32 +; RVA22U64-NEXT: slli a3, a3, 16 +; RVA22U64-NEXT: slli a2, a2, 24 +; RVA22U64-NEXT: or a2, a2, a3 +; RVA22U64-NEXT: slli a4, a4, 32 ; RVA22U64-NEXT: slli a0, a0, 40 -; RVA22U64-NEXT: or a0, a0, a2 -; RVA22U64-NEXT: slli t0, t0, 48 -; RVA22U64-NEXT: slli a4, a4, 56 -; RVA22U64-NEXT: or a2, a4, t0 -; RVA22U64-NEXT: or a1, a1, a3 -; RVA22U64-NEXT: or a0, a0, a2 +; RVA22U64-NEXT: or a0, a0, a4 +; RVA22U64-NEXT: slli a5, a5, 48 +; RVA22U64-NEXT: slli a1, a1, 56 +; RVA22U64-NEXT: or a1, a1, a5 +; RVA22U64-NEXT: or a2, a6, a2 ; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-NEXT: vmv.v.i v8, 0 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 @@ -2520,30 +2520,30 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; ; RV32VB-LABEL: buildvec_v16i8_undef_edges: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 55(a0) -; RV32VB-NEXT: lbu a2, 31(a0) -; RV32VB-NEXT: lbu a3, 44(a0) -; RV32VB-NEXT: lbu a4, 623(a0) -; RV32VB-NEXT: lbu a5, 75(a0) -; RV32VB-NEXT: slli a1, a1, 8 -; RV32VB-NEXT: or a1, a3, a1 -; RV32VB-NEXT: lbu a3, 82(a0) -; RV32VB-NEXT: lbu a6, 93(a0) -; RV32VB-NEXT: slli a4, a4, 16 -; RV32VB-NEXT: slli a5, a5, 24 -; RV32VB-NEXT: or a4, a5, a4 -; RV32VB-NEXT: lbu a5, 105(a0) +; RV32VB-NEXT: lbu a1, 623(a0) +; RV32VB-NEXT: lbu a2, 55(a0) +; RV32VB-NEXT: lbu a3, 31(a0) +; RV32VB-NEXT: lbu a4, 75(a0) +; RV32VB-NEXT: lbu a5, 44(a0) +; RV32VB-NEXT: slli a2, a2, 8 +; RV32VB-NEXT: slli a1, a1, 16 +; RV32VB-NEXT: slli a4, a4, 24 +; RV32VB-NEXT: or a2, a5, a2 +; RV32VB-NEXT: or a1, a4, a1 +; RV32VB-NEXT: lbu a4, 93(a0) +; RV32VB-NEXT: lbu a5, 82(a0) +; RV32VB-NEXT: lbu a6, 105(a0) ; RV32VB-NEXT: lbu a0, 161(a0) -; RV32VB-NEXT: slli a6, a6, 8 -; RV32VB-NEXT: or a3, a3, a6 -; RV32VB-NEXT: slli a5, a5, 16 +; RV32VB-NEXT: slli a4, a4, 8 +; RV32VB-NEXT: or a4, a5, a4 +; RV32VB-NEXT: slli a6, a6, 16 ; RV32VB-NEXT: slli a0, a0, 24 -; RV32VB-NEXT: or a0, a0, a5 -; RV32VB-NEXT: slli a2, a2, 24 -; RV32VB-NEXT: or a1, a1, a4 -; RV32VB-NEXT: or a0, a3, a0 +; RV32VB-NEXT: or a0, a0, a6 +; RV32VB-NEXT: slli a3, a3, 24 +; RV32VB-NEXT: or a1, a2, a1 +; RV32VB-NEXT: or a0, a4, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32VB-NEXT: vmv.v.x v8, a2 +; RV32VB-NEXT: vmv.v.x v8, a3 ; RV32VB-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: vslide1down.vx v8, v8, zero @@ -2607,32 +2607,32 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_undef_edges: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a6, 31(a0) +; RVA22U64-NEXT: lbu a1, 623(a0) ; RVA22U64-NEXT: lbu a2, 44(a0) ; RVA22U64-NEXT: lbu a3, 55(a0) -; RVA22U64-NEXT: lbu a4, 623(a0) +; RVA22U64-NEXT: lbu a6, 31(a0) ; RVA22U64-NEXT: lbu a5, 75(a0) ; RVA22U64-NEXT: slli a2, a2, 32 ; RVA22U64-NEXT: slli a3, a3, 40 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: lbu a3, 82(a0) -; RVA22U64-NEXT: lbu a1, 93(a0) -; RVA22U64-NEXT: slli a4, a4, 48 +; RVA22U64-NEXT: slli a1, a1, 48 ; RVA22U64-NEXT: slli a5, a5, 56 -; RVA22U64-NEXT: or a4, a4, a5 -; RVA22U64-NEXT: lbu a5, 105(a0) +; RVA22U64-NEXT: or a2, a2, a3 +; RVA22U64-NEXT: or a1, a1, a5 +; RVA22U64-NEXT: lbu a3, 93(a0) +; RVA22U64-NEXT: lbu a5, 82(a0) +; RVA22U64-NEXT: lbu a4, 105(a0) ; RVA22U64-NEXT: lbu a0, 161(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a1, a1, a3 -; RVA22U64-NEXT: slli a5, a5, 16 +; RVA22U64-NEXT: slli a3, a3, 8 +; RVA22U64-NEXT: or a3, a3, a5 +; RVA22U64-NEXT: slli a4, a4, 16 ; RVA22U64-NEXT: slli a0, a0, 24 -; RVA22U64-NEXT: or a0, a0, a5 +; RVA22U64-NEXT: or a0, a0, a4 ; RVA22U64-NEXT: slli a6, a6, 24 -; RVA22U64-NEXT: or a2, a2, a4 -; RVA22U64-NEXT: add.uw a2, a6, a2 -; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: or a1, a1, a2 +; RVA22U64-NEXT: add.uw a1, a6, a1 +; RVA22U64-NEXT: or a0, a0, a3 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-NEXT: vmv.v.x v8, a2 +; RVA22U64-NEXT: vmv.v.x v8, a1 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-NEXT: ret ; @@ -2794,26 +2794,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32VB-PACK-NEXT: lbu a2, 1(a0) ; RV32VB-PACK-NEXT: lbu a3, 44(a0) ; RV32VB-PACK-NEXT: lbu a4, 55(a0) -; RV32VB-PACK-NEXT: lbu a5, 75(a0) -; RV32VB-PACK-NEXT: lbu a6, 82(a0) -; RV32VB-PACK-NEXT: lbu a7, 93(a0) +; RV32VB-PACK-NEXT: lbu a5, 82(a0) +; RV32VB-PACK-NEXT: lbu a6, 93(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 ; RV32VB-PACK-NEXT: lbu a2, 144(a0) -; RV32VB-PACK-NEXT: lbu t0, 154(a0) +; RV32VB-PACK-NEXT: lbu a7, 154(a0) ; RV32VB-PACK-NEXT: packh a3, a3, a4 +; RV32VB-PACK-NEXT: lbu a4, 75(a0) ; RV32VB-PACK-NEXT: lbu a0, 124(a0) -; RV32VB-PACK-NEXT: packh a4, a6, a7 -; RV32VB-PACK-NEXT: packh a2, a2, t0 -; RV32VB-PACK-NEXT: packh a5, a0, a5 -; RV32VB-PACK-NEXT: pack a3, a3, a5 -; RV32VB-PACK-NEXT: packh a5, a0, a0 +; RV32VB-PACK-NEXT: packh a5, a5, a6 +; RV32VB-PACK-NEXT: packh a2, a2, a7 +; RV32VB-PACK-NEXT: packh a4, a0, a4 +; RV32VB-PACK-NEXT: pack a3, a3, a4 +; RV32VB-PACK-NEXT: packh a4, a0, a0 ; RV32VB-PACK-NEXT: packh a0, a0, a0 ; RV32VB-PACK-NEXT: pack a0, a0, a2 -; RV32VB-PACK-NEXT: pack a1, a1, a5 +; RV32VB-PACK-NEXT: pack a1, a1, a4 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a1 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 -; RV32VB-PACK-NEXT: pack a1, a4, a5 +; RV32VB-PACK-NEXT: pack a1, a5, a4 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret @@ -2888,23 +2888,23 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RVA22U64-PACK: # %bb.0: ; RVA22U64-PACK-NEXT: lbu a1, 0(a0) ; RVA22U64-PACK-NEXT: lbu a2, 1(a0) -; RVA22U64-PACK-NEXT: lbu a7, 44(a0) -; RVA22U64-PACK-NEXT: lbu t0, 55(a0) -; RVA22U64-PACK-NEXT: lbu a6, 75(a0) -; RVA22U64-PACK-NEXT: lbu a5, 82(a0) +; RVA22U64-PACK-NEXT: lbu a6, 44(a0) +; RVA22U64-PACK-NEXT: lbu a7, 55(a0) +; RVA22U64-PACK-NEXT: lbu t1, 82(a0) ; RVA22U64-PACK-NEXT: lbu a3, 93(a0) -; RVA22U64-PACK-NEXT: packh t1, a1, a2 +; RVA22U64-PACK-NEXT: packh t0, a1, a2 ; RVA22U64-PACK-NEXT: lbu a2, 144(a0) ; RVA22U64-PACK-NEXT: lbu a4, 154(a0) -; RVA22U64-PACK-NEXT: packh a1, a7, t0 +; RVA22U64-PACK-NEXT: packh a1, a6, a7 +; RVA22U64-PACK-NEXT: lbu a5, 75(a0) ; RVA22U64-PACK-NEXT: lbu a0, 124(a0) -; RVA22U64-PACK-NEXT: packh a3, a5, a3 +; RVA22U64-PACK-NEXT: packh a3, t1, a3 ; RVA22U64-PACK-NEXT: packh a2, a2, a4 -; RVA22U64-PACK-NEXT: packh a4, a0, a6 +; RVA22U64-PACK-NEXT: packh a4, a0, a5 ; RVA22U64-PACK-NEXT: packw a1, a1, a4 ; RVA22U64-PACK-NEXT: packh a4, a0, a0 ; RVA22U64-PACK-NEXT: packh a0, a0, a0 -; RVA22U64-PACK-NEXT: packw a5, t1, a4 +; RVA22U64-PACK-NEXT: packw a5, t0, a4 ; RVA22U64-PACK-NEXT: packw a0, a0, a2 ; RVA22U64-PACK-NEXT: packw a2, a3, a4 ; RVA22U64-PACK-NEXT: pack a1, a5, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index a6c8fa5931cae..533b8b6864ebc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -14191,37 +14191,37 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: lbu t0, 13(a0) ; RV64ZVE32F-NEXT: slli a2, a2, 8 ; RV64ZVE32F-NEXT: slli a4, a4, 8 -; RV64ZVE32F-NEXT: slli a6, a6, 8 ; RV64ZVE32F-NEXT: or a1, a2, a1 ; RV64ZVE32F-NEXT: or a3, a4, a3 -; RV64ZVE32F-NEXT: or a2, a6, a5 -; RV64ZVE32F-NEXT: lbu a4, 16(a0) -; RV64ZVE32F-NEXT: lbu a5, 17(a0) -; RV64ZVE32F-NEXT: lbu a6, 20(a0) -; RV64ZVE32F-NEXT: lbu t1, 21(a0) +; RV64ZVE32F-NEXT: lbu a2, 16(a0) +; RV64ZVE32F-NEXT: lbu a4, 17(a0) +; RV64ZVE32F-NEXT: lbu t1, 20(a0) +; RV64ZVE32F-NEXT: lbu t2, 21(a0) +; RV64ZVE32F-NEXT: slli a6, a6, 8 +; RV64ZVE32F-NEXT: or a5, a6, a5 ; RV64ZVE32F-NEXT: slli t0, t0, 8 -; RV64ZVE32F-NEXT: slli a5, a5, 8 -; RV64ZVE32F-NEXT: or a7, t0, a7 -; RV64ZVE32F-NEXT: or a4, a5, a4 -; RV64ZVE32F-NEXT: lbu a5, 24(a0) -; RV64ZVE32F-NEXT: lbu t0, 25(a0) -; RV64ZVE32F-NEXT: slli t1, t1, 8 -; RV64ZVE32F-NEXT: or a6, t1, a6 +; RV64ZVE32F-NEXT: slli a4, a4, 8 +; RV64ZVE32F-NEXT: slli t2, t2, 8 +; RV64ZVE32F-NEXT: or a6, t0, a7 +; RV64ZVE32F-NEXT: or a2, a4, a2 +; RV64ZVE32F-NEXT: or a4, t2, t1 +; RV64ZVE32F-NEXT: lbu a7, 25(a0) +; RV64ZVE32F-NEXT: lbu t0, 24(a0) ; RV64ZVE32F-NEXT: lbu t1, 28(a0) ; RV64ZVE32F-NEXT: lbu a0, 29(a0) -; RV64ZVE32F-NEXT: slli t0, t0, 8 -; RV64ZVE32F-NEXT: or a5, t0, a5 +; RV64ZVE32F-NEXT: slli a7, a7, 8 +; RV64ZVE32F-NEXT: or a7, a7, t0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: slli a0, a0, 8 ; RV64ZVE32F-NEXT: or a0, a0, t1 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a4 +; RV64ZVE32F-NEXT: vmv.v.x v9, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a6 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll index f3e240eead817..b6267bf481c85 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll @@ -1651,67 +1651,67 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal ; ; ZVE32F-LABEL: unzip2a_dual_v16i64_exact: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: ld a6, 64(a1) -; ZVE32F-NEXT: ld a4, 80(a1) -; ZVE32F-NEXT: ld a7, 96(a1) -; ZVE32F-NEXT: ld t0, 0(a2) -; ZVE32F-NEXT: ld a3, 16(a2) -; ZVE32F-NEXT: ld t1, 32(a2) -; ZVE32F-NEXT: ld a5, 112(a1) -; ZVE32F-NEXT: srli t2, a7, 32 -; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu -; ZVE32F-NEXT: vmv.v.x v8, a6 -; ZVE32F-NEXT: srli a6, a6, 32 -; ZVE32F-NEXT: vmv.v.x v9, a7 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; ZVE32F-NEXT: vslide1down.vx v9, v9, t2 ; ZVE32F-NEXT: ld a6, 0(a1) -; ZVE32F-NEXT: ld a7, 16(a1) -; ZVE32F-NEXT: ld t2, 32(a1) -; ZVE32F-NEXT: ld a1, 48(a1) +; ZVE32F-NEXT: ld a4, 16(a1) +; ZVE32F-NEXT: ld a7, 32(a1) +; ZVE32F-NEXT: ld a3, 48(a1) +; ZVE32F-NEXT: ld a5, 80(a1) +; ZVE32F-NEXT: ld t0, 96(a1) +; ZVE32F-NEXT: ld t1, 64(a1) +; ZVE32F-NEXT: ld a1, 112(a1) +; ZVE32F-NEXT: srli t2, a6, 32 +; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; ZVE32F-NEXT: vmv.v.x v8, t0 +; ZVE32F-NEXT: srli t0, t0, 32 +; ZVE32F-NEXT: vmv.v.x v9, t1 +; ZVE32F-NEXT: srli t1, t1, 32 ; ZVE32F-NEXT: vmv.v.x v10, a6 -; ZVE32F-NEXT: srli a6, a6, 32 -; ZVE32F-NEXT: vslide1down.vx v10, v10, a6 +; ZVE32F-NEXT: vslide1down.vx v9, v9, t1 +; ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; ZVE32F-NEXT: vslide1down.vx v10, v10, t2 +; ZVE32F-NEXT: ld t1, 32(a2) +; ZVE32F-NEXT: ld t0, 16(a2) +; ZVE32F-NEXT: ld t2, 0(a2) ; ZVE32F-NEXT: ld a6, 48(a2) ; ZVE32F-NEXT: vmv.v.x v11, t1 ; ZVE32F-NEXT: srli t1, t1, 32 -; ZVE32F-NEXT: vmv.v.x v12, t0 -; ZVE32F-NEXT: srli t0, t0, 32 -; ZVE32F-NEXT: vmv.v.x v13, t2 +; ZVE32F-NEXT: vmv.v.x v12, t2 ; ZVE32F-NEXT: srli t2, t2, 32 -; ZVE32F-NEXT: vslide1down.vx v13, v13, t2 -; ZVE32F-NEXT: vslide1down.vx v12, v12, t0 +; ZVE32F-NEXT: vmv.v.x v13, a7 +; ZVE32F-NEXT: srli a7, a7, 32 +; ZVE32F-NEXT: vslide1down.vx v13, v13, a7 +; ZVE32F-NEXT: vslide1down.vx v12, v12, t2 ; ZVE32F-NEXT: vslide1down.vx v11, v11, t1 -; ZVE32F-NEXT: ld t0, 64(a2) +; ZVE32F-NEXT: ld a7, 64(a2) ; ZVE32F-NEXT: ld t1, 80(a2) ; ZVE32F-NEXT: ld t2, 96(a2) ; ZVE32F-NEXT: ld a2, 112(a2) -; ZVE32F-NEXT: vmv.v.x v14, t0 -; ZVE32F-NEXT: srli t0, t0, 32 -; ZVE32F-NEXT: vslide1down.vx v14, v14, t0 +; ZVE32F-NEXT: vmv.v.x v14, a7 +; ZVE32F-NEXT: srli a7, a7, 32 +; ZVE32F-NEXT: vslide1down.vx v14, v14, a7 ; ZVE32F-NEXT: vmv.v.x v15, t2 -; ZVE32F-NEXT: srli t0, t2, 32 -; ZVE32F-NEXT: vslide1down.vx v15, v15, t0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; ZVE32F-NEXT: srli a4, a4, 32 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; ZVE32F-NEXT: srli a7, t2, 32 +; ZVE32F-NEXT: vslide1down.vx v15, v15, a7 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a5 ; ZVE32F-NEXT: srli a5, a5, 32 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; ZVE32F-NEXT: vslide1down.vx v10, v10, a7 -; ZVE32F-NEXT: srli a4, a7, 32 -; ZVE32F-NEXT: vslide1down.vx v10, v10, a4 -; ZVE32F-NEXT: vslide1down.vx v12, v12, a3 -; ZVE32F-NEXT: srli a3, a3, 32 -; ZVE32F-NEXT: vslide1down.vx v12, v12, a3 -; ZVE32F-NEXT: vmv.v.i v0, 15 -; ZVE32F-NEXT: vslide1down.vx v14, v14, t1 -; ZVE32F-NEXT: srli a3, t1, 32 -; ZVE32F-NEXT: vslide1down.vx v14, v14, a3 -; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVE32F-NEXT: vslide1down.vx v8, v13, a1 -; ZVE32F-NEXT: srli a1, a1, 32 +; ZVE32F-NEXT: vslide1down.vx v16, v9, a5 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: srli a1, a1, 32 +; ZVE32F-NEXT: vslide1down.vx v9, v8, a1 +; ZVE32F-NEXT: vslide1down.vx v8, v10, a4 +; ZVE32F-NEXT: srli a4, a4, 32 +; ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; ZVE32F-NEXT: vslide1down.vx v8, v12, t0 +; ZVE32F-NEXT: srli a1, t0, 32 +; ZVE32F-NEXT: vslide1down.vx v12, v8, a1 +; ZVE32F-NEXT: vmv.v.i v0, 15 +; ZVE32F-NEXT: vslide1down.vx v8, v14, t1 +; ZVE32F-NEXT: srli a1, t1, 32 +; ZVE32F-NEXT: vslide1down.vx v14, v8, a1 +; ZVE32F-NEXT: vslidedown.vi v9, v16, 4, v0.t +; ZVE32F-NEXT: vslide1down.vx v8, v13, a3 +; ZVE32F-NEXT: srli a3, a3, 32 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; ZVE32F-NEXT: vslide1down.vx v10, v11, a6 ; ZVE32F-NEXT: srli a1, a6, 32 diff --git a/llvm/test/CodeGen/RISCV/scmp.ll b/llvm/test/CodeGen/RISCV/scmp.ll index a212714db53e0..56c876a2409d2 100644 --- a/llvm/test/CodeGen/RISCV/scmp.ll +++ b/llvm/test/CodeGen/RISCV/scmp.ll @@ -89,8 +89,8 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; RV32I-NEXT: lw a2, 4(a1) ; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: lw a6, 12(a0) ; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: lw a6, 12(a0) ; RV32I-NEXT: lw a7, 8(a0) ; RV32I-NEXT: beq a6, a5, .LBB4_2 ; RV32I-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index 2be3324b86032..30ffaf6c7ceca 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -562,49 +562,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 16(a1) -; RV64IM-NEXT: lh a3, 24(a1) -; RV64IM-NEXT: lui a4, %hi(.LCPI2_0) -; RV64IM-NEXT: ld a4, %lo(.LCPI2_0)(a4) +; RV64IM-NEXT: lui a2, %hi(.LCPI2_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI2_0)(a2) +; RV64IM-NEXT: lh a3, 16(a1) +; RV64IM-NEXT: lh a4, 24(a1) ; RV64IM-NEXT: lh a5, 0(a1) ; RV64IM-NEXT: lh a1, 8(a1) ; RV64IM-NEXT: li a6, 95 -; RV64IM-NEXT: mulh a7, a3, a4 -; RV64IM-NEXT: mulh t0, a2, a4 -; RV64IM-NEXT: mulh t1, a1, a4 -; RV64IM-NEXT: mulh a4, a5, a4 -; RV64IM-NEXT: add a7, a7, a3 -; RV64IM-NEXT: add t0, t0, a2 +; RV64IM-NEXT: mulh a7, a4, a2 +; RV64IM-NEXT: mulh t0, a3, a2 +; RV64IM-NEXT: mulh t1, a1, a2 +; RV64IM-NEXT: mulh a2, a5, a2 +; RV64IM-NEXT: add a7, a7, a4 +; RV64IM-NEXT: add t0, t0, a3 ; RV64IM-NEXT: add t1, t1, a1 -; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: srli t2, a7, 63 ; RV64IM-NEXT: srai a7, a7, 6 ; RV64IM-NEXT: srli t3, t0, 63 ; RV64IM-NEXT: srai t0, t0, 6 ; RV64IM-NEXT: srli t4, t1, 63 ; RV64IM-NEXT: srai t1, t1, 6 -; RV64IM-NEXT: srli t5, a4, 63 -; RV64IM-NEXT: srai a4, a4, 6 +; RV64IM-NEXT: srli t5, a2, 63 +; RV64IM-NEXT: srai a2, a2, 6 ; RV64IM-NEXT: add a7, a7, t2 ; RV64IM-NEXT: add t0, t0, t3 ; RV64IM-NEXT: add t1, t1, t4 -; RV64IM-NEXT: add a4, a4, t5 +; RV64IM-NEXT: add a2, a2, t5 ; RV64IM-NEXT: mul t2, a7, a6 ; RV64IM-NEXT: mul t3, t0, a6 ; RV64IM-NEXT: mul t4, t1, a6 -; RV64IM-NEXT: mul a6, a4, a6 -; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: mul a6, a2, a6 +; RV64IM-NEXT: add a2, a5, a2 ; RV64IM-NEXT: add a1, a1, t1 -; RV64IM-NEXT: add a2, a2, t0 -; RV64IM-NEXT: add a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: add a3, a3, t0 +; RV64IM-NEXT: add a4, a4, a7 +; RV64IM-NEXT: subw a2, a2, a6 ; RV64IM-NEXT: subw a1, a1, t4 -; RV64IM-NEXT: subw a2, a2, t3 -; RV64IM-NEXT: subw a3, a3, t2 -; RV64IM-NEXT: sh a4, 0(a0) +; RV64IM-NEXT: subw a3, a3, t3 +; RV64IM-NEXT: subw a4, a4, t2 +; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 4(a0) -; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a4, 6(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, diff --git a/llvm/test/CodeGen/RISCV/ucmp.ll b/llvm/test/CodeGen/RISCV/ucmp.ll index 50da56fbc5951..0a400b1c04a3f 100644 --- a/llvm/test/CodeGen/RISCV/ucmp.ll +++ b/llvm/test/CodeGen/RISCV/ucmp.ll @@ -89,8 +89,8 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { ; RV32I-NEXT: lw a2, 4(a1) ; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: lw a6, 12(a0) ; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: lw a6, 12(a0) ; RV32I-NEXT: lw a7, 8(a0) ; RV32I-NEXT: beq a6, a5, .LBB4_2 ; RV32I-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll index 068b9f7620021..c9c49e8f7f532 100644 --- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll +++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll @@ -133,49 +133,49 @@ define i64 @load_i64(ptr %p) { ; RV32I-LABEL: load_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a1, 1(a0) -; RV32I-NEXT: lbu a2, 0(a0) -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: lbu a4, 3(a0) +; RV32I-NEXT: lbu a2, 2(a0) +; RV32I-NEXT: lbu a3, 3(a0) +; RV32I-NEXT: lbu a4, 0(a0) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: lbu a2, 4(a0) -; RV32I-NEXT: lbu a5, 5(a0) -; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 6(a0) +; RV32I-NEXT: slli a2, a2, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: lbu a3, 5(a0) +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a5, 6(a0) ; RV32I-NEXT: lbu a0, 7(a0) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a2, a5, a2 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a4, a0, a4 -; RV32I-NEXT: or a0, a3, a1 -; RV32I-NEXT: or a1, a4, a2 +; RV32I-NEXT: or a5, a0, a5 +; RV32I-NEXT: or a0, a2, a1 +; RV32I-NEXT: or a1, a5, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: load_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a2, 0(a0) -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: lbu a4, 3(a0) +; RV64I-NEXT: lbu a2, 2(a0) +; RV64I-NEXT: lbu a3, 3(a0) +; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a2 -; RV64I-NEXT: lbu a2, 4(a0) -; RV64I-NEXT: lbu a5, 5(a0) -; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a4, a4, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: lbu a4, 6(a0) +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: or a2, a3, a2 +; RV64I-NEXT: lbu a3, 5(a0) +; RV64I-NEXT: lbu a4, 4(a0) +; RV64I-NEXT: lbu a5, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a2, a5, a2 -; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index e8e91f0baee14..3ef9f3f945108 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -489,33 +489,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 16(a1) -; RV64IM-NEXT: lhu a3, 24(a1) -; RV64IM-NEXT: lui a4, %hi(.LCPI2_0) -; RV64IM-NEXT: ld a4, %lo(.LCPI2_0)(a4) +; RV64IM-NEXT: lui a2, %hi(.LCPI2_0) +; RV64IM-NEXT: ld a2, %lo(.LCPI2_0)(a2) +; RV64IM-NEXT: lhu a3, 16(a1) +; RV64IM-NEXT: lhu a4, 24(a1) ; RV64IM-NEXT: lhu a5, 0(a1) ; RV64IM-NEXT: lhu a1, 8(a1) ; RV64IM-NEXT: li a6, 95 -; RV64IM-NEXT: mulhu a7, a3, a4 -; RV64IM-NEXT: mulhu t0, a2, a4 -; RV64IM-NEXT: mulhu t1, a1, a4 -; RV64IM-NEXT: mulhu a4, a5, a4 +; RV64IM-NEXT: mulhu a7, a4, a2 +; RV64IM-NEXT: mulhu t0, a3, a2 +; RV64IM-NEXT: mulhu t1, a1, a2 +; RV64IM-NEXT: mulhu a2, a5, a2 ; RV64IM-NEXT: mul t2, a7, a6 ; RV64IM-NEXT: mul t3, t0, a6 ; RV64IM-NEXT: mul t4, t1, a6 -; RV64IM-NEXT: mul a6, a4, a6 -; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: mul a6, a2, a6 +; RV64IM-NEXT: add a2, a5, a2 ; RV64IM-NEXT: add a1, a1, t1 -; RV64IM-NEXT: add a2, a2, t0 -; RV64IM-NEXT: add a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: add a3, a3, t0 +; RV64IM-NEXT: add a4, a4, a7 +; RV64IM-NEXT: subw a2, a2, a6 ; RV64IM-NEXT: subw a1, a1, t4 -; RV64IM-NEXT: subw a2, a2, t3 -; RV64IM-NEXT: subw a3, a3, t2 -; RV64IM-NEXT: sh a4, 0(a0) +; RV64IM-NEXT: subw a3, a3, t3 +; RV64IM-NEXT: subw a4, a4, t2 +; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 4(a0) -; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a4, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index 2a0228f95f1cd..09b2eeb19a69c 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -30,25 +30,25 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: lshr_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a3, 1(a1) +; RV32I-NEXT: lbu a5, 0(a1) +; RV32I-NEXT: lbu a6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -94,25 +94,25 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: shl_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a3, 1(a1) +; RV32I-NEXT: lbu a5, 0(a1) +; RV32I-NEXT: lbu a6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -158,25 +158,25 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: ashr_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a3, 1(a1) +; RV32I-NEXT: lbu a5, 0(a1) +; RV32I-NEXT: lbu a6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -220,24 +220,24 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t1, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t1, 0(a1) ; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a4, a4, 35 @@ -264,26 +264,26 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: lshr_8bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: lbu a4, 4(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 7(a0) +; RV32I-NEXT: lbu a4, 6(a0) +; RV32I-NEXT: lbu a5, 7(a0) +; RV32I-NEXT: lbu a6, 4(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a6, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: or a5, a5, a3 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: slli a4, a1, 3 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a4, a3 +; RV32I-NEXT: or a4, a1, a6 +; RV32I-NEXT: slli a4, a4, 3 ; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: srl a1, a5, a4 ; RV32I-NEXT: bltz a3, .LBB3_2 @@ -356,24 +356,24 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t1, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t1, 0(a1) ; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a4, a4, 35 @@ -400,26 +400,26 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: shl_8bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a6, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: or a5, a5, a3 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: slli a4, a1, 3 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a4, a3 +; RV32I-NEXT: or a4, a1, a6 +; RV32I-NEXT: slli a4, a4, 3 ; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: sll a1, a5, a4 ; RV32I-NEXT: bltz a3, .LBB4_2 @@ -492,24 +492,24 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t1, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t1, 0(a1) ; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a4, a4, 35 @@ -535,21 +535,21 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_8bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a3, 5(a0) +; RV32I-NEXT: lbu a4, 4(a0) ; RV32I-NEXT: lbu a5, 6(a0) ; RV32I-NEXT: lbu a6, 7(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t0, 1(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 2(a1) +; RV32I-NEXT: lbu t0, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a7, a4, a7 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, t0 ; RV32I-NEXT: slli a4, a5, 16 ; RV32I-NEXT: slli a5, a6, 24 ; RV32I-NEXT: or a4, a5, a4 @@ -629,24 +629,24 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t2, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t2, 0(a1) ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a5, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a6, a5, 35 @@ -660,31 +660,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: j .LBB6_3 ; RV64I-NEXT: .LBB6_2: ; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 0(a0) -; RV64I-NEXT: lbu t0, 2(a0) -; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu t1, 0(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t2, 5(a0) -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: not t0, a4 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: srl a0, a0, a4 -; RV64I-NEXT: sll a4, a5, t0 +; RV64I-NEXT: sll a4, a5, a7 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB6_3: ; RV64I-NEXT: srai a3, a3, 63 @@ -787,27 +787,27 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: andi a1, a1, 12 ; RV32I-NEXT: add a1, t2, a1 ; RV32I-NEXT: andi a3, a0, 24 -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a5, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a6, 0(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: srl a7, a5, a0 -; RV32I-NEXT: slli t0, a6, 1 -; RV32I-NEXT: srl a4, a4, a0 -; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl a7, a4, a0 +; RV32I-NEXT: slli t0, a5, 1 ; RV32I-NEXT: srl a6, a6, a0 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: srl a5, a5, a0 ; RV32I-NEXT: slli t1, a1, 1 ; RV32I-NEXT: srl a0, a1, a0 ; RV32I-NEXT: sll a1, t0, a3 -; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll a4, a4, a3 ; RV32I-NEXT: sll a3, t1, a3 ; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 ; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or a5, a4, a5 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) @@ -815,18 +815,18 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a0, a3, 16 ; RV32I-NEXT: srli t0, a3, 24 ; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: srli t1, a5, 16 -; RV32I-NEXT: srli t2, a5, 24 -; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: srli t1, a4, 16 +; RV32I-NEXT: srli t2, a4, 24 +; RV32I-NEXT: srli a4, a4, 8 ; RV32I-NEXT: srli t3, a1, 16 ; RV32I-NEXT: srli t4, a1, 24 ; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a6, 8(a2) +; RV32I-NEXT: sb a5, 8(a2) ; RV32I-NEXT: sb a3, 9(a2) ; RV32I-NEXT: sb a0, 10(a2) ; RV32I-NEXT: sb t0, 11(a2) -; RV32I-NEXT: sb a4, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: sb a6, 0(a2) +; RV32I-NEXT: sb a4, 1(a2) ; RV32I-NEXT: sb t1, 2(a2) ; RV32I-NEXT: sb t2, 3(a2) ; RV32I-NEXT: sb a7, 4(a2) @@ -868,24 +868,24 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t2, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t2, 0(a1) ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a5, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 5 ; RV64I-NEXT: slli a6, a5, 37 @@ -899,31 +899,31 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: j .LBB7_3 ; RV64I-NEXT: .LBB7_2: ; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 0(a0) -; RV64I-NEXT: lbu t0, 2(a0) -; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu t1, 0(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t2, 5(a0) -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: not t0, a4 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: srl a0, a0, a4 -; RV64I-NEXT: sll a4, a5, t0 +; RV64I-NEXT: sll a4, a5, a7 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB7_3: ; RV64I-NEXT: srai a3, a3, 63 @@ -1083,24 +1083,24 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t2, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t2, 0(a1) ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a5, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a6, a5, 35 @@ -1114,31 +1114,31 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: j .LBB8_3 ; RV64I-NEXT: .LBB8_2: ; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 8(a0) -; RV64I-NEXT: lbu t0, 10(a0) -; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: lbu a7, 10(a0) +; RV64I-NEXT: lbu t0, 11(a0) +; RV64I-NEXT: lbu t1, 8(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 12(a0) -; RV64I-NEXT: lbu t2, 13(a0) -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 12(a0) +; RV64I-NEXT: lbu t2, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: not t0, a4 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: srli a5, a5, 1 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: sll a0, a0, a4 -; RV64I-NEXT: srl a4, a5, t0 +; RV64I-NEXT: srl a4, a5, a7 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB8_3: ; RV64I-NEXT: srai a3, a3, 63 @@ -1322,24 +1322,24 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t2, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t2, 0(a1) ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a5, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 5 ; RV64I-NEXT: slli a6, a5, 37 @@ -1353,31 +1353,31 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: j .LBB9_3 ; RV64I-NEXT: .LBB9_2: ; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 8(a0) -; RV64I-NEXT: lbu t0, 10(a0) -; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: lbu a7, 10(a0) +; RV64I-NEXT: lbu t0, 11(a0) +; RV64I-NEXT: lbu t1, 8(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 12(a0) -; RV64I-NEXT: lbu t2, 13(a0) -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 12(a0) +; RV64I-NEXT: lbu t2, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: not t0, a4 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: srli a5, a5, 1 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: sll a0, a0, a4 -; RV64I-NEXT: srl a4, a5, t0 +; RV64I-NEXT: srl a4, a5, a7 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB9_3: ; RV64I-NEXT: srai a3, a3, 63 @@ -1538,24 +1538,24 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t2, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t2, 0(a1) ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, t1, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a6, a5, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a7, a4, 35 @@ -1571,31 +1571,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: j .LBB10_3 ; RV64I-NEXT: .LBB10_2: ; RV64I-NEXT: lbu a5, 1(a0) -; RV64I-NEXT: lbu a6, 0(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu a6, 2(a0) +; RV64I-NEXT: lbu a7, 3(a0) +; RV64I-NEXT: lbu t0, 0(a0) ; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 6(a0) +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a5, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 5(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: not a7, a3 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: not a6, a3 ; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: srl a0, a0, a3 -; RV64I-NEXT: sll a3, a4, a7 +; RV64I-NEXT: sll a3, a4, a6 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: .LBB10_3: ; RV64I-NEXT: srli a3, a1, 56 @@ -1662,20 +1662,20 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli t1, t1, 8 ; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: or t3, t5, t4 -; RV32I-NEXT: lbu t4, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) -; RV32I-NEXT: slli t1, t1, 8 ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: lbu t4, 0(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t4 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t1 -; RV32I-NEXT: mv t1, sp +; RV32I-NEXT: or a1, a1, t5 +; RV32I-NEXT: mv t4, sp ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or t2, a0, t2 @@ -1684,7 +1684,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a5, a7, a6 ; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: or a6, t2, t0 -; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: sw a0, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: sw a0, 24(sp) @@ -1695,29 +1695,29 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: slli a0, a1, 3 ; RV32I-NEXT: andi a1, a1, 12 -; RV32I-NEXT: add a1, t1, a1 +; RV32I-NEXT: add a1, t4, a1 ; RV32I-NEXT: andi a3, a0, 24 -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a5, 4(a1) -; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a6, 0(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: srl a7, a5, a0 -; RV32I-NEXT: slli t0, a6, 1 -; RV32I-NEXT: srl a4, a4, a0 -; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl a7, a4, a0 +; RV32I-NEXT: slli t0, a5, 1 ; RV32I-NEXT: srl a6, a6, a0 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: srl a5, a5, a0 ; RV32I-NEXT: slli t1, a1, 1 ; RV32I-NEXT: sra a0, a1, a0 ; RV32I-NEXT: sll a1, t0, a3 -; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll a4, a4, a3 ; RV32I-NEXT: sll a3, t1, a3 ; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 ; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or a5, a4, a5 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) @@ -1725,18 +1725,18 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a0, a3, 16 ; RV32I-NEXT: srli t0, a3, 24 ; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: srli t1, a5, 16 -; RV32I-NEXT: srli t2, a5, 24 -; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: srli t1, a4, 16 +; RV32I-NEXT: srli t2, a4, 24 +; RV32I-NEXT: srli a4, a4, 8 ; RV32I-NEXT: srli t3, a1, 16 ; RV32I-NEXT: srli t4, a1, 24 ; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a6, 8(a2) +; RV32I-NEXT: sb a5, 8(a2) ; RV32I-NEXT: sb a3, 9(a2) ; RV32I-NEXT: sb a0, 10(a2) ; RV32I-NEXT: sb t0, 11(a2) -; RV32I-NEXT: sb a4, 0(a2) -; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: sb a6, 0(a2) +; RV32I-NEXT: sb a4, 1(a2) ; RV32I-NEXT: sb t1, 2(a2) ; RV32I-NEXT: sb t2, 3(a2) ; RV32I-NEXT: sb a7, 4(a2) @@ -1778,24 +1778,24 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t2, 1(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 1(a1) +; RV64I-NEXT: lbu t2, 0(a1) ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, t1, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a6, a5, 32 ; RV64I-NEXT: slli a1, a1, 5 ; RV64I-NEXT: slli a7, a4, 37 @@ -1811,31 +1811,31 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: j .LBB11_3 ; RV64I-NEXT: .LBB11_2: ; RV64I-NEXT: lbu a5, 1(a0) -; RV64I-NEXT: lbu a6, 0(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu a6, 2(a0) +; RV64I-NEXT: lbu a7, 3(a0) +; RV64I-NEXT: lbu t0, 0(a0) ; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 6(a0) +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a5, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 5(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: not a7, a3 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: not a6, a3 ; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: srl a0, a0, a3 -; RV64I-NEXT: sll a3, a4, a7 +; RV64I-NEXT: sll a3, a4, a6 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: .LBB11_3: ; RV64I-NEXT: srli a3, a1, 56 @@ -2061,17 +2061,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) @@ -2088,8 +2088,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -2108,22 +2108,22 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: add a1, s6, a1 ; RV64I-NEXT: andi a0, a4, 56 -; RV64I-NEXT: ld a3, 0(a1) -; RV64I-NEXT: ld a5, 8(a1) +; RV64I-NEXT: xori a5, a0, 63 +; RV64I-NEXT: ld a3, 8(a1) ; RV64I-NEXT: ld a6, 16(a1) -; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld a7, 0(a1) ; RV64I-NEXT: ld t0, 24(a1) -; RV64I-NEXT: srl a0, a5, a4 +; RV64I-NEXT: srl a0, a3, a4 ; RV64I-NEXT: slli t1, a6, 1 -; RV64I-NEXT: srl a1, a3, a4 -; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: srl a1, a7, a4 +; RV64I-NEXT: slli a7, a3, 1 ; RV64I-NEXT: srl a3, a6, a4 ; RV64I-NEXT: slli a6, t0, 1 ; RV64I-NEXT: srl t0, t0, a4 -; RV64I-NEXT: sll a4, t1, a7 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: sll a6, a6, a7 -; RV64I-NEXT: srli a7, t0, 56 +; RV64I-NEXT: sll a4, t1, a5 +; RV64I-NEXT: sll a7, a7, a5 +; RV64I-NEXT: sll a5, a6, a5 +; RV64I-NEXT: srli a6, t0, 56 ; RV64I-NEXT: srli t1, t0, 48 ; RV64I-NEXT: srli t2, t0, 40 ; RV64I-NEXT: srli t3, t0, 32 @@ -2131,40 +2131,40 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli t5, t0, 16 ; RV64I-NEXT: srli t6, t0, 8 ; RV64I-NEXT: or a4, a0, a4 -; RV64I-NEXT: or a5, a1, a5 -; RV64I-NEXT: or a6, a3, a6 +; RV64I-NEXT: or a7, a1, a7 +; RV64I-NEXT: or a5, a3, a5 ; RV64I-NEXT: sb t3, 28(a2) ; RV64I-NEXT: sb t2, 29(a2) ; RV64I-NEXT: sb t1, 30(a2) -; RV64I-NEXT: sb a7, 31(a2) +; RV64I-NEXT: sb a6, 31(a2) ; RV64I-NEXT: sb t0, 24(a2) ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) ; RV64I-NEXT: sb t4, 27(a2) -; RV64I-NEXT: srli a7, a6, 56 -; RV64I-NEXT: srli t0, a6, 48 -; RV64I-NEXT: srli t1, a6, 40 -; RV64I-NEXT: srli t2, a6, 32 -; RV64I-NEXT: srli t3, a6, 24 -; RV64I-NEXT: srli t4, a6, 16 -; RV64I-NEXT: srli a6, a6, 8 -; RV64I-NEXT: srli t5, a5, 56 -; RV64I-NEXT: srli t6, a5, 48 -; RV64I-NEXT: srli s0, a5, 40 -; RV64I-NEXT: srli s1, a5, 32 -; RV64I-NEXT: srli s2, a5, 24 -; RV64I-NEXT: srli s3, a5, 16 +; RV64I-NEXT: srli a6, a5, 56 +; RV64I-NEXT: srli t0, a5, 48 +; RV64I-NEXT: srli t1, a5, 40 +; RV64I-NEXT: srli t2, a5, 32 +; RV64I-NEXT: srli t3, a5, 24 +; RV64I-NEXT: srli t4, a5, 16 ; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: srli t5, a7, 56 +; RV64I-NEXT: srli t6, a7, 48 +; RV64I-NEXT: srli s0, a7, 40 +; RV64I-NEXT: srli s1, a7, 32 +; RV64I-NEXT: srli s2, a7, 24 +; RV64I-NEXT: srli s3, a7, 16 +; RV64I-NEXT: srli a7, a7, 8 ; RV64I-NEXT: srli s4, a4, 56 ; RV64I-NEXT: srli s5, a4, 48 ; RV64I-NEXT: srli s6, a4, 40 ; RV64I-NEXT: sb t2, 20(a2) ; RV64I-NEXT: sb t1, 21(a2) ; RV64I-NEXT: sb t0, 22(a2) -; RV64I-NEXT: sb a7, 23(a2) -; RV64I-NEXT: srli a7, a4, 32 +; RV64I-NEXT: sb a6, 23(a2) +; RV64I-NEXT: srli a6, a4, 32 ; RV64I-NEXT: sb a3, 16(a2) -; RV64I-NEXT: sb a6, 17(a2) +; RV64I-NEXT: sb a5, 17(a2) ; RV64I-NEXT: sb t4, 18(a2) ; RV64I-NEXT: sb t3, 19(a2) ; RV64I-NEXT: srli a3, a4, 24 @@ -2172,19 +2172,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb s0, 5(a2) ; RV64I-NEXT: sb t6, 6(a2) ; RV64I-NEXT: sb t5, 7(a2) -; RV64I-NEXT: srli a6, a4, 16 +; RV64I-NEXT: srli a5, a4, 16 ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb a7, 1(a2) ; RV64I-NEXT: sb s3, 2(a2) ; RV64I-NEXT: sb s2, 3(a2) -; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb s6, 13(a2) ; RV64I-NEXT: sb s5, 14(a2) ; RV64I-NEXT: sb s4, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a6, 10(a2) +; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -2539,17 +2539,17 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) @@ -2566,8 +2566,8 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -2587,24 +2587,24 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: andi a0, a3, 32 ; RV64I-NEXT: add a1, s6, a1 -; RV64I-NEXT: ld a4, 0(a1) +; RV64I-NEXT: xori a4, a0, 63 ; RV64I-NEXT: ld a5, 8(a1) ; RV64I-NEXT: ld a6, 16(a1) -; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld a7, 0(a1) ; RV64I-NEXT: ld t0, 24(a1) ; RV64I-NEXT: srl a0, a5, a3 ; RV64I-NEXT: slli t1, a6, 1 -; RV64I-NEXT: srl a1, a4, a3 +; RV64I-NEXT: srl a1, a7, a3 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: srl a4, a6, a3 -; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: srl a6, a6, a3 +; RV64I-NEXT: slli a7, t0, 1 ; RV64I-NEXT: srl a3, t0, a3 -; RV64I-NEXT: sll t0, t1, a7 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: sll a6, a6, a7 -; RV64I-NEXT: srli a7, a4, 24 -; RV64I-NEXT: srli t1, a4, 16 -; RV64I-NEXT: srli t2, a4, 8 +; RV64I-NEXT: sll t0, t1, a4 +; RV64I-NEXT: sll a5, a5, a4 +; RV64I-NEXT: sll a4, a7, a4 +; RV64I-NEXT: srli a7, a6, 24 +; RV64I-NEXT: srli t1, a6, 16 +; RV64I-NEXT: srli t2, a6, 8 ; RV64I-NEXT: srli t3, a3, 56 ; RV64I-NEXT: srli t4, a3, 48 ; RV64I-NEXT: srli t5, a3, 40 @@ -2616,12 +2616,12 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: srli s4, a1, 16 ; RV64I-NEXT: srli s5, a1, 8 ; RV64I-NEXT: srli s6, a0, 24 -; RV64I-NEXT: or a6, a4, a6 -; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: sb a6, 16(a2) ; RV64I-NEXT: sb t2, 17(a2) ; RV64I-NEXT: sb t1, 18(a2) ; RV64I-NEXT: sb a7, 19(a2) -; RV64I-NEXT: srli a4, a0, 16 +; RV64I-NEXT: srli a6, a0, 16 ; RV64I-NEXT: sb t6, 28(a2) ; RV64I-NEXT: sb t5, 29(a2) ; RV64I-NEXT: sb t4, 30(a2) @@ -2639,12 +2639,12 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: sb s3, 3(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a7, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: sb a6, 10(a2) ; RV64I-NEXT: sb s6, 11(a2) -; RV64I-NEXT: srli a0, a6, 56 -; RV64I-NEXT: srli a1, a6, 48 -; RV64I-NEXT: srli a3, a6, 40 -; RV64I-NEXT: srli a4, a6, 32 +; RV64I-NEXT: srli a0, a4, 56 +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: srli a3, a4, 40 +; RV64I-NEXT: srli a4, a4, 32 ; RV64I-NEXT: srli a6, a5, 56 ; RV64I-NEXT: srli a7, a5, 48 ; RV64I-NEXT: srli t1, a5, 40 @@ -2797,13 +2797,13 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: sw t1, 16(sp) ; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(t6) -; RV32I-NEXT: lw a5, 20(t6) -; RV32I-NEXT: lw a7, 24(t6) ; RV32I-NEXT: lw a1, 0(t6) ; RV32I-NEXT: lw a0, 4(t6) ; RV32I-NEXT: lw a4, 8(t6) ; RV32I-NEXT: lw a3, 12(t6) +; RV32I-NEXT: lw a7, 24(t6) +; RV32I-NEXT: lw a5, 20(t6) +; RV32I-NEXT: lw a6, 16(t6) ; RV32I-NEXT: lw t0, 28(t6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -3197,13 +3197,13 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: sw t1, 16(sp) ; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(t6) -; RV32I-NEXT: lw a5, 20(t6) -; RV32I-NEXT: lw a7, 24(t6) ; RV32I-NEXT: lw a1, 0(t6) ; RV32I-NEXT: lw a0, 4(t6) ; RV32I-NEXT: lw a4, 8(t6) ; RV32I-NEXT: lw a3, 12(t6) +; RV32I-NEXT: lw a7, 24(t6) +; RV32I-NEXT: lw a5, 20(t6) +; RV32I-NEXT: lw a6, 16(t6) ; RV32I-NEXT: lw t0, 28(t6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -3376,17 +3376,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) @@ -3403,8 +3403,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -3854,17 +3854,17 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) @@ -3881,8 +3881,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -4112,13 +4112,13 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sw t0, 44(sp) ; RV32I-NEXT: sw t1, 48(sp) ; RV32I-NEXT: sw a5, 52(sp) -; RV32I-NEXT: lw a6, 16(t2) -; RV32I-NEXT: lw a5, 20(t2) -; RV32I-NEXT: lw a7, 24(t2) ; RV32I-NEXT: lw a1, 0(t2) ; RV32I-NEXT: lw a0, 4(t2) ; RV32I-NEXT: lw a4, 8(t2) ; RV32I-NEXT: lw a3, 12(t2) +; RV32I-NEXT: lw a7, 24(t2) +; RV32I-NEXT: lw a5, 20(t2) +; RV32I-NEXT: lw a6, 16(t2) ; RV32I-NEXT: lw t0, 28(t2) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -4512,13 +4512,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sw t0, 44(sp) ; RV32I-NEXT: sw t1, 48(sp) ; RV32I-NEXT: sw a5, 52(sp) -; RV32I-NEXT: lw a6, 16(t2) -; RV32I-NEXT: lw a5, 20(t2) -; RV32I-NEXT: lw a7, 24(t2) ; RV32I-NEXT: lw a1, 0(t2) ; RV32I-NEXT: lw a0, 4(t2) ; RV32I-NEXT: lw a4, 8(t2) ; RV32I-NEXT: lw a3, 12(t2) +; RV32I-NEXT: lw a7, 24(t2) +; RV32I-NEXT: lw a5, 20(t2) +; RV32I-NEXT: lw a6, 16(t2) ; RV32I-NEXT: lw t0, 28(t2) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -4691,17 +4691,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -4714,8 +4714,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -4739,22 +4739,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: add a1, s6, a1 ; RV64I-NEXT: andi a0, a4, 56 -; RV64I-NEXT: ld a3, 0(a1) -; RV64I-NEXT: ld a5, 8(a1) +; RV64I-NEXT: xori a5, a0, 63 +; RV64I-NEXT: ld a3, 8(a1) ; RV64I-NEXT: ld a6, 16(a1) -; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld a7, 0(a1) ; RV64I-NEXT: ld t0, 24(a1) -; RV64I-NEXT: srl a0, a5, a4 +; RV64I-NEXT: srl a0, a3, a4 ; RV64I-NEXT: slli t1, a6, 1 -; RV64I-NEXT: srl a1, a3, a4 -; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: srl a1, a7, a4 +; RV64I-NEXT: slli a7, a3, 1 ; RV64I-NEXT: srl a3, a6, a4 ; RV64I-NEXT: slli a6, t0, 1 ; RV64I-NEXT: sra t0, t0, a4 -; RV64I-NEXT: sll a4, t1, a7 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: sll a6, a6, a7 -; RV64I-NEXT: srli a7, t0, 56 +; RV64I-NEXT: sll a4, t1, a5 +; RV64I-NEXT: sll a7, a7, a5 +; RV64I-NEXT: sll a5, a6, a5 +; RV64I-NEXT: srli a6, t0, 56 ; RV64I-NEXT: srli t1, t0, 48 ; RV64I-NEXT: srli t2, t0, 40 ; RV64I-NEXT: srli t3, t0, 32 @@ -4762,40 +4762,40 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli t5, t0, 16 ; RV64I-NEXT: srli t6, t0, 8 ; RV64I-NEXT: or a4, a0, a4 -; RV64I-NEXT: or a5, a1, a5 -; RV64I-NEXT: or a6, a3, a6 +; RV64I-NEXT: or a7, a1, a7 +; RV64I-NEXT: or a5, a3, a5 ; RV64I-NEXT: sb t3, 28(a2) ; RV64I-NEXT: sb t2, 29(a2) ; RV64I-NEXT: sb t1, 30(a2) -; RV64I-NEXT: sb a7, 31(a2) +; RV64I-NEXT: sb a6, 31(a2) ; RV64I-NEXT: sb t0, 24(a2) ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) ; RV64I-NEXT: sb t4, 27(a2) -; RV64I-NEXT: srli a7, a6, 56 -; RV64I-NEXT: srli t0, a6, 48 -; RV64I-NEXT: srli t1, a6, 40 -; RV64I-NEXT: srli t2, a6, 32 -; RV64I-NEXT: srli t3, a6, 24 -; RV64I-NEXT: srli t4, a6, 16 -; RV64I-NEXT: srli a6, a6, 8 -; RV64I-NEXT: srli t5, a5, 56 -; RV64I-NEXT: srli t6, a5, 48 -; RV64I-NEXT: srli s0, a5, 40 -; RV64I-NEXT: srli s1, a5, 32 -; RV64I-NEXT: srli s2, a5, 24 -; RV64I-NEXT: srli s3, a5, 16 +; RV64I-NEXT: srli a6, a5, 56 +; RV64I-NEXT: srli t0, a5, 48 +; RV64I-NEXT: srli t1, a5, 40 +; RV64I-NEXT: srli t2, a5, 32 +; RV64I-NEXT: srli t3, a5, 24 +; RV64I-NEXT: srli t4, a5, 16 ; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: srli t5, a7, 56 +; RV64I-NEXT: srli t6, a7, 48 +; RV64I-NEXT: srli s0, a7, 40 +; RV64I-NEXT: srli s1, a7, 32 +; RV64I-NEXT: srli s2, a7, 24 +; RV64I-NEXT: srli s3, a7, 16 +; RV64I-NEXT: srli a7, a7, 8 ; RV64I-NEXT: srli s4, a4, 56 ; RV64I-NEXT: srli s5, a4, 48 ; RV64I-NEXT: srli s6, a4, 40 ; RV64I-NEXT: sb t2, 20(a2) ; RV64I-NEXT: sb t1, 21(a2) ; RV64I-NEXT: sb t0, 22(a2) -; RV64I-NEXT: sb a7, 23(a2) -; RV64I-NEXT: srli a7, a4, 32 +; RV64I-NEXT: sb a6, 23(a2) +; RV64I-NEXT: srli a6, a4, 32 ; RV64I-NEXT: sb a3, 16(a2) -; RV64I-NEXT: sb a6, 17(a2) +; RV64I-NEXT: sb a5, 17(a2) ; RV64I-NEXT: sb t4, 18(a2) ; RV64I-NEXT: sb t3, 19(a2) ; RV64I-NEXT: srli a3, a4, 24 @@ -4803,19 +4803,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb s0, 5(a2) ; RV64I-NEXT: sb t6, 6(a2) ; RV64I-NEXT: sb t5, 7(a2) -; RV64I-NEXT: srli a6, a4, 16 +; RV64I-NEXT: srli a5, a4, 16 ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb a7, 1(a2) ; RV64I-NEXT: sb s3, 2(a2) ; RV64I-NEXT: sb s2, 3(a2) -; RV64I-NEXT: sb a7, 12(a2) +; RV64I-NEXT: sb a6, 12(a2) ; RV64I-NEXT: sb s6, 13(a2) ; RV64I-NEXT: sb s5, 14(a2) ; RV64I-NEXT: sb s4, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a6, 10(a2) +; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -5171,17 +5171,17 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -5194,8 +5194,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -5220,24 +5220,24 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: andi a0, a3, 32 ; RV64I-NEXT: add a1, s6, a1 -; RV64I-NEXT: ld a4, 0(a1) +; RV64I-NEXT: xori a4, a0, 63 ; RV64I-NEXT: ld a5, 8(a1) ; RV64I-NEXT: ld a6, 16(a1) -; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld a7, 0(a1) ; RV64I-NEXT: ld t0, 24(a1) ; RV64I-NEXT: srl a0, a5, a3 ; RV64I-NEXT: slli t1, a6, 1 -; RV64I-NEXT: srl a1, a4, a3 +; RV64I-NEXT: srl a1, a7, a3 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: srl a4, a6, a3 -; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: srl a6, a6, a3 +; RV64I-NEXT: slli a7, t0, 1 ; RV64I-NEXT: sra a3, t0, a3 -; RV64I-NEXT: sll t0, t1, a7 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: sll a6, a6, a7 -; RV64I-NEXT: srli a7, a4, 24 -; RV64I-NEXT: srli t1, a4, 16 -; RV64I-NEXT: srli t2, a4, 8 +; RV64I-NEXT: sll t0, t1, a4 +; RV64I-NEXT: sll a5, a5, a4 +; RV64I-NEXT: sll a4, a7, a4 +; RV64I-NEXT: srli a7, a6, 24 +; RV64I-NEXT: srli t1, a6, 16 +; RV64I-NEXT: srli t2, a6, 8 ; RV64I-NEXT: srli t3, a3, 56 ; RV64I-NEXT: srli t4, a3, 48 ; RV64I-NEXT: srli t5, a3, 40 @@ -5249,12 +5249,12 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: srli s4, a1, 16 ; RV64I-NEXT: srli s5, a1, 8 ; RV64I-NEXT: srli s6, a0, 24 -; RV64I-NEXT: or a6, a4, a6 -; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: sb a6, 16(a2) ; RV64I-NEXT: sb t2, 17(a2) ; RV64I-NEXT: sb t1, 18(a2) ; RV64I-NEXT: sb a7, 19(a2) -; RV64I-NEXT: srli a4, a0, 16 +; RV64I-NEXT: srli a6, a0, 16 ; RV64I-NEXT: sb t6, 28(a2) ; RV64I-NEXT: sb t5, 29(a2) ; RV64I-NEXT: sb t4, 30(a2) @@ -5272,12 +5272,12 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: sb s3, 3(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a7, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: sb a6, 10(a2) ; RV64I-NEXT: sb s6, 11(a2) -; RV64I-NEXT: srli a0, a6, 56 -; RV64I-NEXT: srli a1, a6, 48 -; RV64I-NEXT: srli a3, a6, 40 -; RV64I-NEXT: srli a4, a6, 32 +; RV64I-NEXT: srli a0, a4, 56 +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: srli a3, a4, 40 +; RV64I-NEXT: srli a4, a4, 32 ; RV64I-NEXT: srli a6, a5, 56 ; RV64I-NEXT: srli a7, a5, 48 ; RV64I-NEXT: srli t1, a5, 40 @@ -5431,13 +5431,13 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: sw t1, 16(sp) ; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(s6) -; RV32I-NEXT: lw a5, 20(s6) -; RV32I-NEXT: lw a7, 24(s6) ; RV32I-NEXT: lw a1, 0(s6) ; RV32I-NEXT: lw a0, 4(s6) ; RV32I-NEXT: lw a4, 8(s6) ; RV32I-NEXT: lw a3, 12(s6) +; RV32I-NEXT: lw a7, 24(s6) +; RV32I-NEXT: lw a5, 20(s6) +; RV32I-NEXT: lw a6, 16(s6) ; RV32I-NEXT: lw t0, 28(s6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -5833,13 +5833,13 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: sw t1, 16(sp) ; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(s6) -; RV32I-NEXT: lw a5, 20(s6) -; RV32I-NEXT: lw a7, 24(s6) ; RV32I-NEXT: lw a1, 0(s6) ; RV32I-NEXT: lw a0, 4(s6) ; RV32I-NEXT: lw a4, 8(s6) ; RV32I-NEXT: lw a3, 12(s6) +; RV32I-NEXT: lw a7, 24(s6) +; RV32I-NEXT: lw a5, 20(s6) +; RV32I-NEXT: lw a6, 16(s6) ; RV32I-NEXT: lw t0, 28(s6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index 78f63c72d0469..cd7f30d8f5898 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -29,25 +29,25 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: lshr_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a3, 1(a1) +; RV32I-NEXT: lbu a5, 0(a1) +; RV32I-NEXT: lbu a6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -90,25 +90,25 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: shl_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a3, 1(a1) +; RV32I-NEXT: lbu a5, 0(a1) +; RV32I-NEXT: lbu a6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -151,25 +151,25 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: ashr_4bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: lbu a5, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a3, 1(a1) +; RV32I-NEXT: lbu a5, 0(a1) +; RV32I-NEXT: lbu a6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -211,24 +211,24 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 4(a1) ; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -254,25 +254,25 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: lshr_8bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: lbu a4, 4(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 7(a0) +; RV32I-NEXT: lbu a4, 6(a0) +; RV32I-NEXT: lbu a5, 7(a0) +; RV32I-NEXT: lbu a6, 4(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a6, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: or a5, a5, a3 -; RV32I-NEXT: or a4, a1, a4 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a4, a3 +; RV32I-NEXT: or a4, a1, a6 ; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: srl a1, a5, a4 ; RV32I-NEXT: bltz a3, .LBB3_2 @@ -344,24 +344,24 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 4(a1) ; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -387,25 +387,25 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-LABEL: shl_8bytes: ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: lbu a5, 2(a0) -; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 0(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a6, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: or a5, a5, a3 -; RV32I-NEXT: or a4, a1, a4 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a4, a3 +; RV32I-NEXT: or a4, a1, a6 ; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: sll a1, a5, a4 ; RV32I-NEXT: bltz a3, .LBB4_2 @@ -477,24 +477,24 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: lbu t1, 5(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t0 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 4(a1) ; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -519,21 +519,21 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_8bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a3, 5(a0) +; RV32I-NEXT: lbu a4, 4(a0) ; RV32I-NEXT: lbu a5, 6(a0) ; RV32I-NEXT: lbu a6, 7(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t0, 1(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 2(a1) +; RV32I-NEXT: lbu t0, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a7, a4, a7 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, t0 ; RV32I-NEXT: slli a4, a5, 16 ; RV32I-NEXT: slli a5, a6, 24 ; RV32I-NEXT: or a4, a5, a4 @@ -611,24 +611,24 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: lbu t2, 5(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t2, 4(a1) ; RV64I-NEXT: lbu t3, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a5, a4, a3 @@ -641,31 +641,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: j .LBB6_3 ; RV64I-NEXT: .LBB6_2: ; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 0(a0) -; RV64I-NEXT: lbu t0, 2(a0) -; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu t1, 0(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: lbu t2, 5(a0) -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 4(a0) +; RV64I-NEXT: lbu t2, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: not t0, a4 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: srl a0, a0, a4 -; RV64I-NEXT: sll a4, a5, t0 +; RV64I-NEXT: sll a4, a5, a7 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB6_3: ; RV64I-NEXT: srai a3, a3, 63 @@ -736,24 +736,24 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t4, t4, 24 ; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t1, 0(a1) -; RV32I-NEXT: lbu t4, 1(a1) ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or a6, t1, a6 ; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: lbu t2, 2(a1) +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: lbu t2, 0(a1) +; RV32I-NEXT: lbu t4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t1, t4, t1 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: or a1, a1, t4 ; RV32I-NEXT: mv t2, sp ; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, t0, a7 @@ -767,28 +767,28 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a0, a1, 3 ; RV32I-NEXT: andi a3, a1, 31 ; RV32I-NEXT: andi a0, a0, 12 -; RV32I-NEXT: add a0, t2, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a6, 8(a0) ; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: add a0, t2, a0 +; RV32I-NEXT: lw a4, 4(a0) +; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: lw a6, 0(a0) ; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srl a7, a5, a1 -; RV32I-NEXT: slli t0, a6, 1 -; RV32I-NEXT: srl a4, a4, a1 -; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl a7, a4, a1 +; RV32I-NEXT: slli t0, a5, 1 ; RV32I-NEXT: srl a6, a6, a1 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: srl a5, a5, a1 ; RV32I-NEXT: slli t1, a0, 1 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: sll a1, t0, a3 -; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll a4, a4, a3 ; RV32I-NEXT: sll a3, t1, a3 ; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 ; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) @@ -847,24 +847,24 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: lbu t2, 5(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t2, 4(a1) ; RV64I-NEXT: lbu t3, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a5, a4, a3 @@ -877,31 +877,31 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: j .LBB7_3 ; RV64I-NEXT: .LBB7_2: ; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 8(a0) -; RV64I-NEXT: lbu t0, 10(a0) -; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: lbu a7, 10(a0) +; RV64I-NEXT: lbu t0, 11(a0) +; RV64I-NEXT: lbu t1, 8(a0) ; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: lbu a7, 12(a0) -; RV64I-NEXT: lbu t2, 13(a0) -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 13(a0) +; RV64I-NEXT: lbu t1, 12(a0) +; RV64I-NEXT: lbu t2, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: not t0, a4 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: srli a5, a5, 1 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: sll a0, a0, a4 -; RV64I-NEXT: srl a4, a5, t0 +; RV64I-NEXT: srl a4, a5, a7 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: .LBB7_3: ; RV64I-NEXT: srai a3, a3, 63 @@ -972,24 +972,24 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t4, t4, 24 ; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t1, 0(a1) -; RV32I-NEXT: lbu t4, 1(a1) ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or a6, t1, a6 ; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: lbu t2, 2(a1) +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: lbu t2, 0(a1) +; RV32I-NEXT: lbu t4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t1, t4, t1 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: or a1, a1, t4 ; RV32I-NEXT: addi t2, sp, 16 ; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, t0, a7 @@ -1083,24 +1083,24 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or t1, t2, t1 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: lbu t2, 5(a1) ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t2, 4(a1) ; RV64I-NEXT: lbu t3, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t2 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, t1, a5 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a5, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a4, a4, a3 @@ -1115,31 +1115,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: j .LBB8_3 ; RV64I-NEXT: .LBB8_2: ; RV64I-NEXT: lbu a5, 1(a0) -; RV64I-NEXT: lbu a6, 0(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: lbu a6, 2(a0) +; RV64I-NEXT: lbu a7, 3(a0) +; RV64I-NEXT: lbu t0, 0(a0) ; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: lbu a6, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 6(a0) +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a5, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 5(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a6, t1, a6 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: not a7, a3 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: not a6, a3 ; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: srl a0, a0, a3 -; RV64I-NEXT: sll a3, a4, a7 +; RV64I-NEXT: sll a3, a4, a6 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: .LBB8_3: ; RV64I-NEXT: srli a3, a1, 56 @@ -1206,29 +1206,29 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli t1, t1, 8 ; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: or t3, t5, t4 -; RV32I-NEXT: lbu t4, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) -; RV32I-NEXT: slli t1, t1, 8 ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: lbu t4, 0(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t4 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: or a1, a1, t5 ; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: mv a5, sp ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or t1, a0, t2 +; RV32I-NEXT: or t2, a0, t2 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: or a6, a7, a6 ; RV32I-NEXT: or a4, t3, a4 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a7, t2, t0 +; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: sw a0, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: sw a0, 24(sp) @@ -1240,28 +1240,28 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a0, a1, 3 ; RV32I-NEXT: andi a3, a1, 31 ; RV32I-NEXT: andi a0, a0, 12 -; RV32I-NEXT: add a0, a5, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a6, 8(a0) ; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: add a0, a5, a0 +; RV32I-NEXT: lw a4, 4(a0) +; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: lw a6, 0(a0) ; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srl a7, a5, a1 -; RV32I-NEXT: slli t0, a6, 1 -; RV32I-NEXT: srl a4, a4, a1 -; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl a7, a4, a1 +; RV32I-NEXT: slli t0, a5, 1 ; RV32I-NEXT: srl a6, a6, a1 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: srl a5, a5, a1 ; RV32I-NEXT: slli t1, a0, 1 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: sll a1, t0, a3 -; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll a4, a4, a3 ; RV32I-NEXT: sll a3, t1, a3 ; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 ; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) @@ -1388,17 +1388,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -1415,8 +1415,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t1, s0, t6 ; RV64I-NEXT: or t2, s5, s1 -; RV64I-NEXT: or t3, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t3, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a3, a3, 32 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -1434,23 +1434,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli a0, a1, 3 ; RV64I-NEXT: andi a3, a1, 63 ; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: add a0, a6, a0 -; RV64I-NEXT: ld a4, 0(a0) -; RV64I-NEXT: ld a5, 8(a0) -; RV64I-NEXT: ld a6, 16(a0) ; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: add a0, a6, a0 +; RV64I-NEXT: ld a4, 8(a0) +; RV64I-NEXT: ld a5, 16(a0) +; RV64I-NEXT: ld a6, 0(a0) ; RV64I-NEXT: ld a0, 24(a0) -; RV64I-NEXT: srl a7, a5, a1 -; RV64I-NEXT: slli t0, a6, 1 -; RV64I-NEXT: srl a4, a4, a1 -; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: srl a7, a4, a1 +; RV64I-NEXT: slli t0, a5, 1 ; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: srl a5, a5, a1 ; RV64I-NEXT: slli t1, a0, 1 ; RV64I-NEXT: srl t2, a0, a1 ; RV64I-NEXT: sll a0, t0, a3 -; RV64I-NEXT: sll a1, a5, a3 +; RV64I-NEXT: sll a1, a4, a3 ; RV64I-NEXT: sll a3, t1, a3 -; RV64I-NEXT: srli a5, t2, 56 +; RV64I-NEXT: srli a4, t2, 56 ; RV64I-NEXT: srli t0, t2, 48 ; RV64I-NEXT: srli t1, t2, 40 ; RV64I-NEXT: srli t3, t2, 32 @@ -1458,12 +1458,12 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli t5, t2, 16 ; RV64I-NEXT: srli t6, t2, 8 ; RV64I-NEXT: or a0, a7, a0 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a1, a6, a1 +; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: sb t3, 28(a2) ; RV64I-NEXT: sb t1, 29(a2) ; RV64I-NEXT: sb t0, 30(a2) -; RV64I-NEXT: sb a5, 31(a2) +; RV64I-NEXT: sb a4, 31(a2) ; RV64I-NEXT: sb t2, 24(a2) ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) @@ -1864,17 +1864,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -1891,8 +1891,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t1, s0, t6 ; RV64I-NEXT: or t2, s5, s1 -; RV64I-NEXT: or t3, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t3, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a3, a3, 32 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -2340,17 +2340,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 ; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) -; RV64I-NEXT: lbu s6, 5(a1) ; RV64I-NEXT: slli s4, s4, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s4, s7, s4 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -2363,8 +2363,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -2387,23 +2387,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli a0, a1, 3 ; RV64I-NEXT: andi a3, a1, 63 ; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: add a0, s6, a0 -; RV64I-NEXT: ld a4, 0(a0) -; RV64I-NEXT: ld a5, 8(a0) -; RV64I-NEXT: ld a6, 16(a0) ; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: add a0, s6, a0 +; RV64I-NEXT: ld a4, 8(a0) +; RV64I-NEXT: ld a5, 16(a0) +; RV64I-NEXT: ld a6, 0(a0) ; RV64I-NEXT: ld a0, 24(a0) -; RV64I-NEXT: srl a7, a5, a1 -; RV64I-NEXT: slli t0, a6, 1 -; RV64I-NEXT: srl a4, a4, a1 -; RV64I-NEXT: slli a5, a5, 1 +; RV64I-NEXT: srl a7, a4, a1 +; RV64I-NEXT: slli t0, a5, 1 ; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: srl a5, a5, a1 ; RV64I-NEXT: slli t1, a0, 1 ; RV64I-NEXT: sra t2, a0, a1 ; RV64I-NEXT: sll a0, t0, a3 -; RV64I-NEXT: sll a1, a5, a3 +; RV64I-NEXT: sll a1, a4, a3 ; RV64I-NEXT: sll a3, t1, a3 -; RV64I-NEXT: srli a5, t2, 56 +; RV64I-NEXT: srli a4, t2, 56 ; RV64I-NEXT: srli t0, t2, 48 ; RV64I-NEXT: srli t1, t2, 40 ; RV64I-NEXT: srli t3, t2, 32 @@ -2411,12 +2411,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli t5, t2, 16 ; RV64I-NEXT: srli t6, t2, 8 ; RV64I-NEXT: or a0, a7, a0 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a1, a6, a1 +; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: sb t3, 28(a2) ; RV64I-NEXT: sb t1, 29(a2) ; RV64I-NEXT: sb t0, 30(a2) -; RV64I-NEXT: sb a5, 31(a2) +; RV64I-NEXT: sb a4, 31(a2) ; RV64I-NEXT: sb t2, 24(a2) ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) diff --git a/llvm/test/CodeGen/RISCV/xtheadmempair.ll b/llvm/test/CodeGen/RISCV/xtheadmempair.ll index 3525c40026064..4df61dad7d039 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmempair.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmempair.ll @@ -57,14 +57,14 @@ define i64 @lwud(ptr %a) { define i64 @ldd(ptr %a) { ; RV32XTHEADMEMPAIR-LABEL: ldd: ; RV32XTHEADMEMPAIR: # %bb.0: -; RV32XTHEADMEMPAIR-NEXT: lw a1, 44(a0) -; RV32XTHEADMEMPAIR-NEXT: lw a2, 32(a0) -; RV32XTHEADMEMPAIR-NEXT: lw a3, 36(a0) +; RV32XTHEADMEMPAIR-NEXT: lw a1, 32(a0) +; RV32XTHEADMEMPAIR-NEXT: lw a2, 36(a0) +; RV32XTHEADMEMPAIR-NEXT: lw a3, 44(a0) ; RV32XTHEADMEMPAIR-NEXT: lw a0, 40(a0) -; RV32XTHEADMEMPAIR-NEXT: add a1, a3, a1 -; RV32XTHEADMEMPAIR-NEXT: add a0, a2, a0 -; RV32XTHEADMEMPAIR-NEXT: sltu a2, a0, a2 -; RV32XTHEADMEMPAIR-NEXT: add a1, a1, a2 +; RV32XTHEADMEMPAIR-NEXT: add a2, a2, a3 +; RV32XTHEADMEMPAIR-NEXT: add a0, a1, a0 +; RV32XTHEADMEMPAIR-NEXT: sltu a1, a0, a1 +; RV32XTHEADMEMPAIR-NEXT: add a1, a2, a1 ; RV32XTHEADMEMPAIR-NEXT: ret ; ; RV64XTHEADMEMPAIR-LABEL: ldd: diff --git a/llvm/test/CodeGen/RISCV/zilsd.ll b/llvm/test/CodeGen/RISCV/zilsd.ll index 6a4578bb02d8d..09b065a8e9ddd 100644 --- a/llvm/test/CodeGen/RISCV/zilsd.ll +++ b/llvm/test/CodeGen/RISCV/zilsd.ll @@ -36,25 +36,25 @@ define i64 @load_unaligned(ptr %p) { ; SLOW-LABEL: load_unaligned: ; SLOW: # %bb.0: ; SLOW-NEXT: lbu a1, 1(a0) -; SLOW-NEXT: lbu a2, 0(a0) -; SLOW-NEXT: lbu a3, 2(a0) -; SLOW-NEXT: lbu a4, 3(a0) +; SLOW-NEXT: lbu a2, 2(a0) +; SLOW-NEXT: lbu a3, 3(a0) +; SLOW-NEXT: lbu a4, 0(a0) ; SLOW-NEXT: slli a1, a1, 8 -; SLOW-NEXT: or a1, a1, a2 -; SLOW-NEXT: lbu a2, 4(a0) -; SLOW-NEXT: lbu a5, 5(a0) -; SLOW-NEXT: slli a3, a3, 16 -; SLOW-NEXT: slli a4, a4, 24 -; SLOW-NEXT: or a3, a4, a3 -; SLOW-NEXT: lbu a4, 6(a0) +; SLOW-NEXT: slli a2, a2, 16 +; SLOW-NEXT: slli a3, a3, 24 +; SLOW-NEXT: or a1, a1, a4 +; SLOW-NEXT: or a2, a3, a2 +; SLOW-NEXT: lbu a3, 5(a0) +; SLOW-NEXT: lbu a4, 4(a0) +; SLOW-NEXT: lbu a5, 6(a0) ; SLOW-NEXT: lbu a0, 7(a0) -; SLOW-NEXT: slli a5, a5, 8 -; SLOW-NEXT: or a2, a5, a2 -; SLOW-NEXT: slli a4, a4, 16 +; SLOW-NEXT: slli a3, a3, 8 +; SLOW-NEXT: or a3, a3, a4 +; SLOW-NEXT: slli a5, a5, 16 ; SLOW-NEXT: slli a0, a0, 24 -; SLOW-NEXT: or a4, a0, a4 -; SLOW-NEXT: or a0, a3, a1 -; SLOW-NEXT: or a1, a4, a2 +; SLOW-NEXT: or a5, a0, a5 +; SLOW-NEXT: or a0, a2, a1 +; SLOW-NEXT: or a1, a5, a3 ; SLOW-NEXT: ret ; ; FAST-LABEL: load_unaligned: diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll index 93203ef6e17f5..c12d8135e5eba 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -25,20 +25,18 @@ define i32 @simple(ptr %a, ptr %b, i32 %x) nounwind { ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: addl (%rdi), %eax ; X64-NEXT: addl (%rdi,%rcx), %eax -; X64-NEXT: leaq (%rdi,%rcx), %r8 -; X64-NEXT: addl (%rcx,%r8), %eax -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: addl (%rcx,%r8), %eax -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: movq %r8, %rdi -; X64-NEXT: cmpq %rsi, %r8 +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: addl (%rcx,%rdi), %eax +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: addl (%rcx,%rdi), %eax +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: cmpq %rsi, %rdi ; X64-NEXT: jne .LBB0_1 ; X64-NEXT: # %bb.2: # %exit ; X64-NEXT: retq ; ; X32-LABEL: simple: ; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -52,18 +50,16 @@ define i32 @simple(ptr %a, ptr %b, i32 %x) nounwind { ; X32-NEXT: # =>This Inner Loop Header: Depth=1 ; X32-NEXT: addl (%edi), %eax ; X32-NEXT: addl (%edi,%edx), %eax -; X32-NEXT: leal (%edi,%edx), %ebx -; X32-NEXT: addl (%edx,%ebx), %eax -; X32-NEXT: addl %edx, %ebx -; X32-NEXT: addl (%edx,%ebx), %eax -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: movl %ebx, %edi -; X32-NEXT: cmpl %ecx, %ebx +; X32-NEXT: addl %edx, %edi +; X32-NEXT: addl (%edx,%edi), %eax +; X32-NEXT: addl %edx, %edi +; X32-NEXT: addl (%edx,%edi), %eax +; X32-NEXT: addl %esi, %edi +; X32-NEXT: cmpl %ecx, %edi ; X32-NEXT: jne .LBB0_1 ; X32-NEXT: # %bb.2: # %exit ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi -; X32-NEXT: popl %ebx ; X32-NEXT: retl entry: br label %loop