diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 5f1983791cfae..6205495e8e22b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "SIModeRegisterDefaults.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -26,6 +27,7 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/KnownFPClass.h" @@ -109,6 +111,7 @@ class AMDGPUCodeGenPrepareImpl bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; mutable Function *LdexpF32 = nullptr; + mutable SmallVector DeadVals; DenseMap BreakPhiNodesCache; @@ -285,28 +288,18 @@ bool AMDGPUCodeGenPrepareImpl::run() { BreakPhiNodesCache.clear(); bool MadeChange = false; - Function::iterator NextBB; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { - BasicBlock *BB = &*FI; - NextBB = std::next(FI); - - BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - I = Next) { - Next = std::next(I); - - MadeChange |= visit(*I); - - if (Next != E) { // Control flow changed - BasicBlock *NextInstBB = Next->getParent(); - if (NextInstBB != BB) { - BB = NextInstBB; - E = BB->end(); - FE = F.end(); - } - } + for (BasicBlock &BB : reverse(F)) { + for (Instruction &I : make_early_inc_range(reverse(BB))) { + if (!isInstructionTriviallyDead(&I, TLI)) + MadeChange |= visit(I); } } + + while (!DeadVals.empty()) { + if (auto *I = dyn_cast_or_null(DeadVals.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + } + return MadeChange; } @@ -426,7 +419,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { Value *NewVal = insertValues(Builder, Ty, ResultVals); NewVal->takeName(&I); I.replaceAllUsesWith(NewVal); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -500,10 +493,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { FoldedT, FoldedF); NewSelect->takeName(&BO); BO.replaceAllUsesWith(NewSelect); - BO.eraseFromParent(); + DeadVals.push_back(&BO); if (CastOp) - CastOp->eraseFromParent(); - Sel->eraseFromParent(); + DeadVals.push_back(CastOp); + DeadVals.push_back(Sel); return true; } @@ -900,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { if (NewVal) { FDiv.replaceAllUsesWith(NewVal); NewVal->takeName(&FDiv); - RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI); + DeadVals.push_back(&FDiv); } return true; @@ -1310,7 +1303,8 @@ within the byte are all 0. static bool tryNarrowMathIfNoOverflow(Instruction *I, const SITargetLowering *TLI, const TargetTransformInfo &TTI, - const DataLayout &DL) { + const DataLayout &DL, + SmallVector &DeadVals) { unsigned Opc = I->getOpcode(); Type *OldType = I->getType(); @@ -1365,7 +1359,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, Value *Zext = Builder.CreateZExt(Arith, OldType); I->replaceAllUsesWith(Zext); - I->eraseFromParent(); + DeadVals.push_back(I); return true; } @@ -1376,7 +1370,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (UseMul24Intrin && replaceMulWithMul24(I)) return true; if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), - TM.getTargetTransformInfo(F), DL)) + TM.getTargetTransformInfo(F), DL, DeadVals)) return true; bool Changed = false; @@ -1441,7 +1435,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (NewDiv) { I.replaceAllUsesWith(NewDiv); - I.eraseFromParent(); + DeadVals.push_back(&I); Changed = true; } } @@ -1497,7 +1491,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); I.replaceAllUsesWith(ValOrig); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1539,7 +1533,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Fract->takeName(&I); I.replaceAllUsesWith(Fract); - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } @@ -1827,7 +1821,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { } I.replaceAllUsesWith(Vec); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -1908,7 +1902,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { auto *Intrin = B.CreateIntrinsic( I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)}); I.replaceAllUsesWith(Intrin); - I.eraseFromParent(); + DeadVals.push_back(&I); return true; } @@ -2005,16 +1999,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *Fract = applyFractPat(Builder, FractArg); Fract->takeName(&I); I.replaceAllUsesWith(Fract); - - RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); + DeadVals.push_back(&I); return true; } -static bool isOneOrNegOne(const Value *Val) { - const APFloat *C; - return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; -} - // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Type *Ty = Sqrt.getType()->getScalarType(); @@ -2035,18 +2023,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { if (ReqdAccuracy < 1.0f) return false; - // FIXME: This is an ugly hack for this pass using forward iteration instead - // of reverse. If it worked like a normal combiner, the rsq would form before - // we saw a sqrt call. - auto *FDiv = - dyn_cast_or_null(Sqrt.getUniqueUndroppableUser()); - if (FDiv && FDiv->getOpcode() == Instruction::FDiv && - FDiv->getFPAccuracy() >= 1.0f && - canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && - // TODO: We should also handle the arcp case for the fdiv with non-1 value - isOneOrNegOne(FDiv->getOperand(0))) - return false; - Value *SrcVal = Sqrt.getOperand(0); bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt); @@ -2070,7 +2046,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals); NewSqrt->takeName(&Sqrt); Sqrt.replaceAllUsesWith(NewSqrt); - Sqrt.eraseFromParent(); + DeadVals.push_back(&Sqrt); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll index c94b33334646d..1f36902762f0b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll @@ -726,16 +726,16 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE815:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 4 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE016:%.*]] = insertelement <5 x double> poison, double [[TMP10]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE117:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE016]], double [[TMP11]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE218:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE117]], double [[TMP12]], i64 2 @@ -746,8 +746,8 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE28:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE17]], double [[TMP7]], i64 2 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE39:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE28]], double [[TMP8]], i64 3 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE410:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE39]], double [[TMP9]], i64 4 -; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1 ; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE420]], ptr [[OUT]], align 1 +; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1 ; CHECK-NEXT: ret void ; entry: @@ -1187,11 +1187,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE960:%.*]] = extractelement <5 x double> [[IN]], i64 4 ; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[COND5_END]] ; CHECK: cond5.end: -; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE046:%.*]] = insertelement <5 x double> poison, double [[TMP25]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE147:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE046]], double [[TMP26]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE248:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE147]], double [[TMP27]], i64 2 @@ -1204,11 +1204,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE859:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE450]], i64 4 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE061:%.*]] = insertelement <5 x double> poison, double [[TMP30]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE162:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE061]], double [[TMP31]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE263:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE162]], double [[TMP32]], i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll index a4f9ce3e7350a..7ff86ac152feb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -2160,7 +2160,22 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 ; IEEE-GOODFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]] +; IEEE-GOODFREXP-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP57:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP58:%.*]] = fcmp olt float [[TMP56]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP60:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP56]], i32 [[TMP59]]) +; IEEE-GOODFREXP-NEXT: [[TMP61:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP60]]) +; IEEE-GOODFREXP-NEXT: [[TMP62:%.*]] = select i1 [[TMP58]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP63:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP61]], i32 [[TMP62]]) +; IEEE-GOODFREXP-NEXT: [[TMP64:%.*]] = fcmp olt float [[TMP57]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP66:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP57]], i32 [[TMP65]]) +; IEEE-GOODFREXP-NEXT: [[TMP67:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP66]]) +; IEEE-GOODFREXP-NEXT: [[TMP68:%.*]] = select i1 [[TMP64]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP69:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP67]], i32 [[TMP68]]) +; IEEE-GOODFREXP-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0 +; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP69]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 ; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -2231,7 +2246,22 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 ; IEEE-BADFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]] +; IEEE-BADFREXP-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP57:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP58:%.*]] = fcmp olt float [[TMP56]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP60:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP56]], i32 [[TMP59]]) +; IEEE-BADFREXP-NEXT: [[TMP61:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP60]]) +; IEEE-BADFREXP-NEXT: [[TMP62:%.*]] = select i1 [[TMP58]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP63:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP61]], i32 [[TMP62]]) +; IEEE-BADFREXP-NEXT: [[TMP64:%.*]] = fcmp olt float [[TMP57]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP66:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP57]], i32 [[TMP65]]) +; IEEE-BADFREXP-NEXT: [[TMP67:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP66]]) +; IEEE-BADFREXP-NEXT: [[TMP68:%.*]] = select i1 [[TMP64]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP69:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP67]], i32 [[TMP68]]) +; IEEE-BADFREXP-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0 +; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP69]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 ; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -2258,7 +2288,12 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; DAZ-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) ; DAZ-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> splat (float 1.000000e+00), [[SQRT_X_NO_MD]] ; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META2:![0-9]+]] +; DAZ-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP41:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP39]]) +; DAZ-NEXT: [[TMP42:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP40]]) +; DAZ-NEXT: [[TMP43:%.*]] = insertelement <2 x float> poison, float [[TMP41]], i64 0 +; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = insertelement <2 x float> [[TMP43]], float [[TMP42]], i64 1 ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 ; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 ; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -2276,7 +2311,9 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; DAZ-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1 ; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 ; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP44:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP44]]) ; DAZ-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]]) ; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 ; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1 @@ -2290,7 +2327,12 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; DAZ-NEXT: [[TMP26:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0 ; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP26]], float [[TMP25]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]] +; DAZ-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP36:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP34]]) +; DAZ-NEXT: [[TMP37:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP35]]) +; DAZ-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP36]], i64 0 +; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 ; DAZ-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 ; DAZ-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -3200,9 +3242,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) { ; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 ; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 ; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; DAZ-NEXT: [[TMP17:%.*]] = fneg contract float [[TMP13]] -; DAZ-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP42]]) +; DAZ-NEXT: [[TMP45:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP45]] ; DAZ-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]]) ; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 ; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 @@ -3675,9 +3721,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar ; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 ; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 ; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; DAZ-NEXT: [[TMP17:%.*]] = fneg arcp contract float [[TMP13]] -; DAZ-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP26]]) +; DAZ-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP18:%.*]] = fneg arcp contract float [[TMP29]] ; DAZ-NEXT: [[TMP19:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) ; DAZ-NEXT: [[TMP20:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP19]] ; DAZ-NEXT: [[TMP21:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP15]]) @@ -3850,19 +3900,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float ; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 ; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) -; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = extractvalue { float, i32 } [[TMP48]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP50]] -; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]]) -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP22]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fneg contract float [[TMP13]] ; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) ; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0 ; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1 @@ -3903,19 +3943,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float ; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 ; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) -; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0 -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]]) -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]] -; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]]) -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP50]], i32 [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fneg contract float [[TMP13]] ; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) ; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0 ; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]]) @@ -3956,9 +3986,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float ; DAZ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 ; DAZ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 ; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; DAZ-NEXT: [[TMP13:%.*]] = fneg contract float [[TMP9]] -; DAZ-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; DAZ-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; DAZ-NEXT: [[TMP14:%.*]] = fneg contract float [[TMP13]] ; DAZ-NEXT: [[TMP15:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) ; DAZ-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0 ; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll index 3983655285e57..38239c5509318 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -1634,29 +1634,18 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract: ; IR-IEEE-SDAG: ; %bb.0: ; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000 -; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0 -; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v2, v1, v0 -; IR-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 -; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0 -; IR-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; IR-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 -; IR-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v1 -; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v3, v2, v2 ; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2 ; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 ; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 ; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 @@ -1668,24 +1657,14 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract: ; IR-IEEE-GISEL: ; %bb.0: ; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 -; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 -; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0 -; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 -; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 -; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] -; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] -; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 -; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; IR-IEEE-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -1705,75 +1684,24 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 ; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract: -; IR-DAZ-SDAG: ; %bb.0: -; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 -; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1 -; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 -; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; IR-DAZ-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; IR-DAZ-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; IR-DAZ-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract: -; IR-DAZ-GISEL: ; %bb.0: -; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; IR-DAZ-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 -; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 -; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 -; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; IR-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; IR-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract: +; IR-DAZ: ; %bb.0: +; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !0 %fdiv = fdiv contract float 1.0, %sqrt, !fpmath !0 ret float %fdiv diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index f001bf0d5e498..b52913fbb969f 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -20,34 +20,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s4, s3 +; GFX90A-NEXT: s_cselect_b32 s7, s3, s2 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s5, s7 +; GFX90A-NEXT: s_cselect_b32 s7, s4, s7 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s6, s7 +; GFX90A-NEXT: s_cselect_b32 s7, s5, s7 ; GFX90A-NEXT: s_or_b32 s7, s7, s0 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s4, s7, s4 +; GFX90A-NEXT: s_cselect_b32 s3, s7, s3 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 ; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec -; GFX90A-NEXT: s_cselect_b32 s6, s7, s6 +; GFX90A-NEXT: s_cselect_b32 s5, s7, s5 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 ; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX90A-NEXT: s_cselect_b32 s5, s7, s5 +; GFX90A-NEXT: s_cselect_b32 s4, s7, s4 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 0 -; GFX90A-NEXT: s_cselect_b32 s3, s7, s3 +; GFX90A-NEXT: s_cselect_b32 s2, s7, s2 ; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] ; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s2, 0, s2 +; GFX90A-NEXT: s_cselect_b32 s6, 0, s6 ; GFX90A-NEXT: s_mov_b64 vcc, vcc ; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -68,34 +68,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX942-NEXT: s_cmp_eq_u32 s1, 1 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s4, s3 +; GFX942-NEXT: s_cselect_b32 s7, s3, s2 ; GFX942-NEXT: s_cmp_eq_u32 s1, 2 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s5, s7 +; GFX942-NEXT: s_cselect_b32 s7, s4, s7 ; GFX942-NEXT: s_cmp_eq_u32 s1, 3 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s6, s7 +; GFX942-NEXT: s_cselect_b32 s7, s5, s7 ; GFX942-NEXT: s_or_b32 s7, s7, s0 ; GFX942-NEXT: s_cmp_eq_u32 s1, 1 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX942-NEXT: s_and_b64 s[10:11], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s4, s7, s4 +; GFX942-NEXT: s_cselect_b32 s3, s7, s3 ; GFX942-NEXT: s_cmp_eq_u32 s1, 3 ; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX942-NEXT: s_and_b64 s[12:13], s[10:11], exec -; GFX942-NEXT: s_cselect_b32 s6, s7, s6 +; GFX942-NEXT: s_cselect_b32 s5, s7, s5 ; GFX942-NEXT: s_cmp_eq_u32 s1, 2 ; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX942-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX942-NEXT: s_cselect_b32 s5, s7, s5 +; GFX942-NEXT: s_cselect_b32 s4, s7, s4 ; GFX942-NEXT: s_cmp_eq_u32 s1, 0 -; GFX942-NEXT: s_cselect_b32 s3, s7, s3 +; GFX942-NEXT: s_cselect_b32 s2, s7, s2 ; GFX942-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] ; GFX942-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s2, 0, s2 +; GFX942-NEXT: s_cselect_b32 s6, 0, s6 ; GFX942-NEXT: s_mov_b64 vcc, vcc ; GFX942-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -117,34 +117,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 ; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1030-NEXT: s_cselect_b32 s7, s3, s2 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1030-NEXT: s_cselect_b32 s7, s4, s7 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s6, s7 +; GFX1030-NEXT: s_cselect_b32 s7, s5, s7 ; GFX1030-NEXT: s_or_b32 s7, s7, s0 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1030-NEXT: s_cselect_b32 s3, s7, s3 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 ; GFX1030-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo -; GFX1030-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 ; GFX1030-NEXT: s_cselect_b32 s10, -1, 0 ; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo -; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 0 -; GFX1030-NEXT: s_cselect_b32 s3, s7, s3 +; GFX1030-NEXT: s_cselect_b32 s2, s7, s2 ; GFX1030-NEXT: s_or_b32 s7, s10, s8 ; GFX1030-NEXT: s_or_b32 s7, s9, s7 ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1030-NEXT: s_cselect_b32 s2, 0, s2 +; GFX1030-NEXT: s_cselect_b32 s6, 0, s6 ; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock ; GFX1030-NEXT: s_endpgm @@ -166,38 +166,38 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1100-NEXT: s_cselect_b32 s7, s3, s2 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1100-NEXT: s_cselect_b32 s7, s4, s7 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s6, s7 +; GFX1100-NEXT: s_cselect_b32 s7, s5, s7 ; GFX1100-NEXT: s_or_b32 s7, s7, s0 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1100-NEXT: s_cselect_b32 s3, s7, s3 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 ; GFX1100-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo -; GFX1100-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1100-NEXT: s_cselect_b32 s5, s7, s5 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 ; GFX1100-NEXT: s_cselect_b32 s10, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo -; GFX1100-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1100-NEXT: s_cselect_b32 s4, s7, s4 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 0 -; GFX1100-NEXT: s_cselect_b32 s3, s7, s3 +; GFX1100-NEXT: s_cselect_b32 s2, s7, s2 ; GFX1100-NEXT: s_or_b32 s7, s10, s8 ; GFX1100-NEXT: s_or_b32 s7, s9, s7 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1100-NEXT: s_cselect_b32 s2, 0, s2 +; GFX1100-NEXT: s_cselect_b32 s6, 0, s6 ; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock ; GFX1100-NEXT: s_endpgm