From e6b8edc2236d781e0aa52f68cbead0c03685f07d Mon Sep 17 00:00:00 2001 From: Min Hsu Date: Thu, 23 May 2024 15:49:29 -0700 Subject: [PATCH 1/4] [AArch64][LoopIdiom] Generalize AArch64LoopIdiomTransform into LoopIdiomTransform To facilitate sharing LoopIdiomTransform between AArch64 and RISC-V, this patch first moves AArch64LoopIdiomTransform from lib/Target/AArch64 to lib/Transforms/Vectorize. In addition, the key component that is likely to differ for RVV's vectorization style is preemptively factored out in this patch. --- .../Vectorize/LoopIdiomTransform.h} | 14 +- llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Target/AArch64/AArch64.h | 1 - .../Target/AArch64/AArch64PassRegistry.def | 20 - .../Target/AArch64/AArch64TargetMachine.cpp | 8 +- .../lib/Target/AArch64/AArch64TargetMachine.h | 1 - llvm/lib/Target/AArch64/CMakeLists.txt | 2 +- llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + .../Vectorize/LoopIdiomTransform.cpp} | 439 ++++++++---------- .../LoopIdiom/AArch64/byte-compare-index.ll | 405 ++++++++-------- 11 files changed, 400 insertions(+), 493 deletions(-) rename llvm/{lib/Target/AArch64/AArch64LoopIdiomTransform.h => include/llvm/Transforms/Vectorize/LoopIdiomTransform.h} (60%) delete mode 100644 llvm/lib/Target/AArch64/AArch64PassRegistry.def rename llvm/lib/{Target/AArch64/AArch64LoopIdiomTransform.cpp => Transforms/Vectorize/LoopIdiomTransform.cpp} (71%) diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h similarity index 60% rename from llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h rename to llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h index cc68425bb68b5..a97dcc7ae3a3f 100644 --- a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h @@ -1,4 +1,4 @@ -//===- AArch64LoopIdiomTransform.h --------------------------------------===// 
+//===----------LoopIdiomTransform.h -----------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,20 +6,16 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H +#ifndef LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H +#define LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H #include "llvm/IR/PassManager.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { - -struct AArch64LoopIdiomTransformPass - : PassInfoMixin { +struct LoopIdiomTransformPass : PassInfoMixin { PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); }; - } // namespace llvm - -#endif // LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H +#endif // LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 316d05bf1dc37..76305606c74b7 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -297,6 +297,7 @@ #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Utils/UnifyLoopExits.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" +#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 50682ca4970f1..714058f91bfc6 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -621,6 +621,7 @@ LOOP_PASS("invalidate", InvalidateAllAnalysesPass()) LOOP_PASS("loop-bound-split", LoopBoundSplitPass()) LOOP_PASS("loop-deletion", 
LoopDeletionPass()) LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) +LOOP_PASS("loop-idiom-transform", LoopIdiomTransformPass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("loop-predication", LoopPredicationPass()) LOOP_PASS("loop-reduce", LoopStrengthReducePass()) diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 0f0a22ec82936..6f2aeb83a451a 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -90,7 +90,6 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); void initializeAArch64ExpandPseudoPass(PassRegistry &); void initializeAArch64GlobalsTaggingPass(PassRegistry &); void initializeAArch64LoadStoreOptPass(PassRegistry&); -void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &); void initializeAArch64MIPeepholeOptPass(PassRegistry &); void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &); diff --git a/llvm/lib/Target/AArch64/AArch64PassRegistry.def b/llvm/lib/Target/AArch64/AArch64PassRegistry.def deleted file mode 100644 index ca944579f93a9..0000000000000 --- a/llvm/lib/Target/AArch64/AArch64PassRegistry.def +++ /dev/null @@ -1,20 +0,0 @@ -//===- AArch64PassRegistry.def - Registry of AArch64 passes -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is used as the registry of passes that are part of the -// AArch64 backend. -// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! 
- -#ifndef LOOP_PASS -#define LOOP_PASS(NAME, CREATE_PASS) -#endif -LOOP_PASS("aarch64-lit", AArch64LoopIdiomTransformPass()) -#undef LOOP_PASS diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 30f0ceaf674c6..0d4050a8bdf2c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,7 +11,6 @@ #include "AArch64TargetMachine.h" #include "AArch64.h" -#include "AArch64LoopIdiomTransform.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64MachineScheduler.h" #include "AArch64MacroFusion.h" @@ -52,6 +51,7 @@ #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/CFGuard.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h" #include #include #include @@ -234,7 +234,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { initializeAArch64DeadRegisterDefinitionsPass(*PR); initializeAArch64ExpandPseudoPass(*PR); initializeAArch64LoadStoreOptPass(*PR); - initializeAArch64LoopIdiomTransformLegacyPassPass(*PR); initializeAArch64MIPeepholeOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64O0PreLegalizerCombinerPass(*PR); @@ -553,12 +552,9 @@ class AArch64PassConfig : public TargetPassConfig { void AArch64TargetMachine::registerPassBuilderCallbacks( PassBuilder &PB, bool PopulateClassToPassNames) { -#define GET_PASS_REGISTRY "AArch64PassRegistry.def" -#include "llvm/Passes/TargetPassRegistry.inc" - PB.registerLateLoopOptimizationsEPCallback( [=](LoopPassManager &LPM, OptimizationLevel Level) { - LPM.addPass(AArch64LoopIdiomTransformPass()); + LPM.addPass(LoopIdiomTransformPass()); }); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 8fb68b06f1378..e396d9204716a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -14,7 
+14,6 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H #include "AArch64InstrInfo.h" -#include "AArch64LoopIdiomTransform.h" #include "AArch64Subtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 8e76f6c9279e7..639bc0707dff2 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -65,7 +65,6 @@ add_llvm_target(AArch64CodeGen AArch64ISelLowering.cpp AArch64InstrInfo.cpp AArch64LoadStoreOptimizer.cpp - AArch64LoopIdiomTransform.cpp AArch64LowerHomogeneousPrologEpilog.cpp AArch64MachineFunctionInfo.cpp AArch64MachineScheduler.cpp @@ -112,6 +111,7 @@ add_llvm_target(AArch64CodeGen Target TargetParser TransformUtils + Vectorize ADD_TO_COMPONENT AArch64 diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 9674094024b9e..3ca5c404d020f 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMVectorize LoadStoreVectorizer.cpp + LoopIdiomTransform.cpp LoopVectorizationLegality.cpp LoopVectorize.cpp SLPVectorizer.cpp diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp similarity index 71% rename from llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp rename to llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp index 8ae3f014d45e0..c9f8189660321 100644 --- a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp @@ -1,4 +1,4 @@ -//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -------------===// +//===-------- LoopIdiomTransform.cpp - Loop idiom recognition -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. @@ -35,7 +35,8 @@ // //===----------------------------------------------------------------------===// -#include "AArch64LoopIdiomTransform.h" +#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -44,47 +45,46 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "aarch64-loop-idiom-transform" +#define DEBUG_TYPE "loop-idiom-transform" -static cl::opt - DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), - cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); - -static cl::opt DisableByteCmp( - "disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), - cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " - "not convert byte-compare loop(s).")); - -static cl::opt VerifyLoops( - "aarch64-lit-verify", cl::Hidden, cl::init(false), - cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); - -namespace llvm { +static cl::opt DisableAll("disable-loop-idiom-transform-all", cl::Hidden, + cl::init(false), + cl::desc("Disable Loop Idiom Transform Pass.")); -void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); -Pass *createAArch64LoopIdiomTransformPass(); +static cl::opt + DisableByteCmp("disable-loop-idiom-transform-bytecmp", cl::Hidden, + cl::init(false), + cl::desc("Proceed with Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); -} // end namespace llvm +static cl::opt + VerifyLoops("verify-loop-idiom-transform", cl::Hidden, cl::init(false), + cl::desc("Verify loops generated Loop Idiom Transform Pass.")); namespace { - -class AArch64LoopIdiomTransform { 
+class LoopIdiomTransform { Loop *CurLoop = nullptr; DominatorTree *DT; LoopInfo *LI; const TargetTransformInfo *TTI; const DataLayout *DL; + // Blocks that will be used for inserting vectorized code. + BasicBlock *EndBlock = nullptr; + BasicBlock *VectorLoopPreheaderBlock = nullptr; + BasicBlock *VectorLoopStartBlock = nullptr; + BasicBlock *VectorLoopMismatchBlock = nullptr; + BasicBlock *VectorLoopIncBlock = nullptr; + public: - explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, - const TargetTransformInfo *TTI, - const DataLayout *DL) + explicit LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) : DT(DT), LI(LI), TTI(TTI), DL(DL) {} bool run(Loop *L); @@ -98,83 +98,32 @@ class AArch64LoopIdiomTransform { SmallVectorImpl &ExitBlocks); bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen); + + Value *createMaskedFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA, + GetElementPtrInst *GEPB, Value *ExtStart, + Value *ExtEnd); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, BasicBlock *FoundBB, BasicBlock *EndBB); /// @} }; +} // anonymous namespace -class AArch64LoopIdiomTransformLegacyPass : public LoopPass { -public: - static char ID; - - explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { - initializeAArch64LoopIdiomTransformLegacyPassPass( - *PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { - return "Transform AArch64-specific loop idioms"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; -}; - -bool 
AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, - LPPassManager &LPM) { - - if (skipLoop(L)) - return false; - - auto *DT = &getAnalysis().getDomTree(); - auto *LI = &getAnalysis().getLoopInfo(); - auto &TTI = getAnalysis().getTTI( - *L->getHeader()->getParent()); - return AArch64LoopIdiomTransform( - DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout()) - .run(L); -} - -} // end anonymous namespace - -char AArch64LoopIdiomTransformLegacyPass::ID = 0; - -INITIALIZE_PASS_BEGIN( - AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", - "Transform specific loop idioms into optimized vector forms", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END( - AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", - "Transform specific loop idioms into optimized vector forms", false, false) - -Pass *llvm::createAArch64LoopIdiomTransformPass() { - return new AArch64LoopIdiomTransformLegacyPass(); -} - -PreservedAnalyses -AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &) { +PreservedAnalyses LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { if (DisableAll) return PreservedAnalyses::all(); const auto *DL = &L.getHeader()->getModule()->getDataLayout(); - AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); + LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); if (!LIT.run(&L)) return PreservedAnalyses::all(); @@ -183,11 +132,11 @@ AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, //===----------------------------------------------------------------------===// // -// Implementation of AArch64LoopIdiomTransform +// Implementation of LoopIdiomTransform // 
//===----------------------------------------------------------------------===// -bool AArch64LoopIdiomTransform::run(Loop *L) { +bool LoopIdiomTransform::run(Loop *L) { CurLoop = L; Function &F = *L->getHeader()->getParent(); @@ -211,7 +160,7 @@ bool AArch64LoopIdiomTransform::run(Loop *L) { return recognizeByteCompare(); } -bool AArch64LoopIdiomTransform::recognizeByteCompare() { +bool LoopIdiomTransform::recognizeByteCompare() { // Currently the transformation only works on scalable vector types, although // there is no fundamental reason why it cannot be made to work for fixed // width too. @@ -224,7 +173,7 @@ bool AArch64LoopIdiomTransform::recognizeByteCompare() { BasicBlock *Header = CurLoop->getHeader(); - // In AArch64LoopIdiomTransform::run we have already checked that the loop + // In LoopIdiomTransform::run we have already checked that the loop // has a preheader so we can assume it's in a canonical form. if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) return false; @@ -242,8 +191,7 @@ bool AArch64LoopIdiomTransform::recognizeByteCompare() { // %cmp.not = icmp eq i32 %inc, %n // br i1 %cmp.not, label %while.end, label %while.body // - auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug(); - if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) > 4) + if (LoopBlocks[0]->sizeWithoutDebug() > 4) return false; // The second block should contain 7 instructions, e.g. @@ -257,8 +205,7 @@ bool AArch64LoopIdiomTransform::recognizeByteCompare() { // %cmp.not.ld = icmp eq i8 %load.a, %load.b // br i1 %cmp.not.ld, label %while.cond, label %while.end // - auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug(); - if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) > 7) + if (LoopBlocks[1]->sizeWithoutDebug() > 7) return false; // The incoming value to the PHI node from the loop should be an add of 1. 
@@ -393,7 +340,107 @@ bool AArch64LoopIdiomTransform::recognizeByteCompare() { return true; } -Value *AArch64LoopIdiomTransform::expandFindMismatch( +Value *LoopIdiomTransform::createMaskedFindMismatch(IRBuilder<> &Builder, + GetElementPtrInst *GEPA, + GetElementPtrInst *GEPB, + Value *ExtStart, + Value *ExtEnd) { + Type *I64Type = Builder.getInt64Ty(); + Type *ResType = Builder.getInt32Ty(); + Type *LoadType = Builder.getInt8Ty(); + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // At this point we know two things must be true: + // 1. Start <= End + // 2. ExtMaxLen <= MinPageSize due to the page checks. + // Therefore, we know that we can use a 64-bit induction variable that + // starts from 0 -> ExtMaxLen and it will not overflow. + ScalableVectorType *PredVTy = + ScalableVectorType::get(Builder.getInt1Ty(), 16); + + Value *InitialPred = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); + + Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); + VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "", + /*HasNUW=*/true, /*HasNSW=*/true); + + Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), + Builder.getInt1(false)); + + BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock); + Builder.Insert(JumpToVectorLoop); + + // Set up the first vector loop block by creating the PHIs, doing the vector + // loads and comparing the vectors. 
+ Builder.SetInsertPoint(VectorLoopStartBlock); + PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred"); + LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock); + PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index"); + VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock); + Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16); + Value *Passthru = ConstantInt::getNullValue(VectorLoadType); + + Value *VectorLhsGep = + Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds()); + Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep, + Align(1), LoopPred, Passthru); + + Value *VectorRhsGep = + Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds()); + Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep, + Align(1), LoopPred, Passthru); + + Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad); + VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse); + Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp); + BranchInst *VectorEarlyExit = BranchInst::Create( + VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes); + Builder.Insert(VectorEarlyExit); + + // Increment the index counter and calculate the predicate for the next + // iteration of the loop. We branch back to the start of the loop if there + // is at least one active lane. 
+ Builder.SetInsertPoint(VectorLoopIncBlock); + Value *NewVectorIndexPhi = + Builder.CreateAdd(VectorIndexPhi, VecLen, "", + /*HasNUW=*/true, /*HasNSW=*/true); + VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock); + Value *NewPred = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd}); + LoopPred->addIncoming(NewPred, VectorLoopIncBlock); + + Value *PredHasActiveLanes = + Builder.CreateExtractElement(NewPred, uint64_t(0)); + BranchInst *VectorLoopBranchBack = + BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes); + Builder.Insert(VectorLoopBranchBack); + + // If we found a mismatch then we need to calculate which lane in the vector + // had a mismatch and add that on to the current loop index. + Builder.SetInsertPoint(VectorLoopMismatchBlock); + PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred"); + FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock); + PHINode *LastLoopPred = + Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred"); + LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock); + PHINode *VectorFoundIndex = + Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index"); + VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock); + + Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred); + Value *Ctz = Builder.CreateIntrinsic( + Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, + {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); + Ctz = Builder.CreateZExt(Ctz, I64Type); + Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "", + /*HasNUW=*/true, /*HasNSW=*/true); + return Builder.CreateTrunc(VectorLoopRes64, ResType); +} + +Value *LoopIdiomTransform::expandFindMismatch( IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { Value *PtrA = GEPA->getPointerOperand(); @@ 
-407,17 +454,16 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch( Type *ResType = Builder.getInt32Ty(); // Split block in the original loop preheader. - BasicBlock *EndBlock = - SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); + EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); // Create the blocks that we're going to need: // 1. A block for checking the zero-extended length exceeds 0 // 2. A block to check that the start and end addresses of a given array // lie on the same page. - // 3. The SVE loop preheader. - // 4. The first SVE loop block. - // 5. The SVE loop increment block. - // 6. A block we can jump to from the SVE loop when a mismatch is found. + // 3. The vector loop preheader. + // 4. The first vector loop block. + // 5. The vector loop increment block. + // 6. A block we can jump to from the vector loop when a mismatch is found. // 7. The first block of the scalar loop itself, containing PHIs , loads // and cmp. // 8. A scalar loop increment block to increment the PHIs and go back @@ -432,17 +478,17 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch( BasicBlock *MemCheckBlock = BasicBlock::Create( Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock); - BasicBlock *SVELoopPreheaderBlock = BasicBlock::Create( - Ctx, "mismatch_sve_loop_preheader", EndBlock->getParent(), EndBlock); + VectorLoopPreheaderBlock = BasicBlock::Create( + Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock); - BasicBlock *SVELoopStartBlock = BasicBlock::Create( - Ctx, "mismatch_sve_loop", EndBlock->getParent(), EndBlock); + VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop", + EndBlock->getParent(), EndBlock); - BasicBlock *SVELoopIncBlock = BasicBlock::Create( - Ctx, "mismatch_sve_loop_inc", EndBlock->getParent(), EndBlock); + VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc", + EndBlock->getParent(), EndBlock); - BasicBlock *SVELoopMismatchBlock = 
BasicBlock::Create( - Ctx, "mismatch_sve_loop_found", EndBlock->getParent(), EndBlock); + VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found", + EndBlock->getParent(), EndBlock); BasicBlock *LoopPreHeaderBlock = BasicBlock::Create( Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock); @@ -456,26 +502,27 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch( DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock}, {DominatorTree::Delete, Preheader, EndBlock}}); - // Update LoopInfo with the new SVE & scalar loops. - auto SVELoop = LI->AllocateLoop(); + // Update LoopInfo with the new vector & scalar loops. + auto VectorLoop = LI->AllocateLoop(); auto ScalarLoop = LI->AllocateLoop(); if (CurLoop->getParentLoop()) { CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI); CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI); - CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopPreheaderBlock, *LI); - CurLoop->getParentLoop()->addChildLoop(SVELoop); - CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopMismatchBlock, *LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopPreheaderBlock, + *LI); + CurLoop->getParentLoop()->addChildLoop(VectorLoop); + CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopMismatchBlock, *LI); CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI); CurLoop->getParentLoop()->addChildLoop(ScalarLoop); } else { - LI->addTopLevelLoop(SVELoop); + LI->addTopLevelLoop(VectorLoop); LI->addTopLevelLoop(ScalarLoop); } // Add the new basic blocks to their associated loops. 
- SVELoop->addBasicBlockToLoop(SVELoopStartBlock, *LI); - SVELoop->addBasicBlockToLoop(SVELoopIncBlock, *LI); + VectorLoop->addBasicBlockToLoop(VectorLoopStartBlock, *LI); + VectorLoop->addBasicBlockToLoop(VectorLoopIncBlock, *LI); ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI); ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI); @@ -497,10 +544,6 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch( MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1)); Builder.Insert(MinItCheckBr); - DTU.applyUpdates( - {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, - {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); - // For each of the arrays, check the start/end addresses are on the same // page. Builder.SetInsertPoint(MemCheckBlock); @@ -537,129 +580,26 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch( Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp); BranchInst *CombinedPageCmpCmpBr = BranchInst::Create( - LoopPreHeaderBlock, SVELoopPreheaderBlock, CombinedPageCmp); + LoopPreHeaderBlock, VectorLoopPreheaderBlock, CombinedPageCmp); CombinedPageCmpCmpBr->setMetadata( LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext()) .createBranchWeights(10, 90)); Builder.Insert(CombinedPageCmpCmpBr); - DTU.applyUpdates( - {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, - {DominatorTree::Insert, MemCheckBlock, SVELoopPreheaderBlock}}); - - // Set up the SVE loop preheader, i.e. calculate initial loop predicate, + // Set up the vector loop preheader, i.e. calculate initial loop predicate, // zero-extend MaxLen to 64-bits, determine the number of vector elements // processed in each iteration, etc. - Builder.SetInsertPoint(SVELoopPreheaderBlock); + Builder.SetInsertPoint(VectorLoopPreheaderBlock); - // At this point we know two things must be true: - // 1. Start <= End - // 2. ExtMaxLen <= MinPageSize due to the page checks. 
- // Therefore, we know that we can use a 64-bit induction variable that - // starts from 0 -> ExtMaxLen and it will not overflow. - ScalableVectorType *PredVTy = - ScalableVectorType::get(Builder.getInt1Ty(), 16); - - Value *InitialPred = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); - - Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); - VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "", - /*HasNUW=*/true, /*HasNSW=*/true); - - Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), - Builder.getInt1(false)); - - BranchInst *JumpToSVELoop = BranchInst::Create(SVELoopStartBlock); - Builder.Insert(JumpToSVELoop); - - DTU.applyUpdates( - {{DominatorTree::Insert, SVELoopPreheaderBlock, SVELoopStartBlock}}); - - // Set up the first SVE loop block by creating the PHIs, doing the vector - // loads and comparing the vectors. - Builder.SetInsertPoint(SVELoopStartBlock); - PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_sve_loop_pred"); - LoopPred->addIncoming(InitialPred, SVELoopPreheaderBlock); - PHINode *SVEIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_sve_index"); - SVEIndexPhi->addIncoming(ExtStart, SVELoopPreheaderBlock); - Type *SVELoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16); - Value *Passthru = ConstantInt::getNullValue(SVELoadType); - - Value *SVELhsGep = - Builder.CreateGEP(LoadType, PtrA, SVEIndexPhi, "", GEPA->isInBounds()); - Value *SVELhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVELhsGep, Align(1), - LoopPred, Passthru); - - Value *SVERhsGep = - Builder.CreateGEP(LoadType, PtrB, SVEIndexPhi, "", GEPB->isInBounds()); - Value *SVERhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVERhsGep, Align(1), - LoopPred, Passthru); - - Value *SVEMatchCmp = Builder.CreateICmpNE(SVELhsLoad, SVERhsLoad); - SVEMatchCmp = Builder.CreateSelect(LoopPred, SVEMatchCmp, PFalse); - Value *SVEMatchHasActiveLanes = 
Builder.CreateOrReduce(SVEMatchCmp); - BranchInst *SVEEarlyExit = BranchInst::Create( - SVELoopMismatchBlock, SVELoopIncBlock, SVEMatchHasActiveLanes); - Builder.Insert(SVEEarlyExit); - - DTU.applyUpdates( - {{DominatorTree::Insert, SVELoopStartBlock, SVELoopMismatchBlock}, - {DominatorTree::Insert, SVELoopStartBlock, SVELoopIncBlock}}); - - // Increment the index counter and calculate the predicate for the next - // iteration of the loop. We branch back to the start of the loop if there - // is at least one active lane. - Builder.SetInsertPoint(SVELoopIncBlock); - Value *NewSVEIndexPhi = Builder.CreateAdd(SVEIndexPhi, VecLen, "", - /*HasNUW=*/true, /*HasNSW=*/true); - SVEIndexPhi->addIncoming(NewSVEIndexPhi, SVELoopIncBlock); - Value *NewPred = - Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, - {PredVTy, I64Type}, {NewSVEIndexPhi, ExtEnd}); - LoopPred->addIncoming(NewPred, SVELoopIncBlock); - - Value *PredHasActiveLanes = - Builder.CreateExtractElement(NewPred, uint64_t(0)); - BranchInst *SVELoopBranchBack = - BranchInst::Create(SVELoopStartBlock, EndBlock, PredHasActiveLanes); - Builder.Insert(SVELoopBranchBack); - - DTU.applyUpdates({{DominatorTree::Insert, SVELoopIncBlock, SVELoopStartBlock}, - {DominatorTree::Insert, SVELoopIncBlock, EndBlock}}); - - // If we found a mismatch then we need to calculate which lane in the vector - // had a mismatch and add that on to the current loop index. 
- Builder.SetInsertPoint(SVELoopMismatchBlock); - PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_sve_found_pred"); - FoundPred->addIncoming(SVEMatchCmp, SVELoopStartBlock); - PHINode *LastLoopPred = - Builder.CreatePHI(PredVTy, 1, "mismatch_sve_last_loop_pred"); - LastLoopPred->addIncoming(LoopPred, SVELoopStartBlock); - PHINode *SVEFoundIndex = - Builder.CreatePHI(I64Type, 1, "mismatch_sve_found_index"); - SVEFoundIndex->addIncoming(SVEIndexPhi, SVELoopStartBlock); - - Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred); - Value *Ctz = Builder.CreateIntrinsic( - Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, - {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); - Ctz = Builder.CreateZExt(Ctz, I64Type); - Value *SVELoopRes64 = Builder.CreateAdd(SVEFoundIndex, Ctz, "", - /*HasNUW=*/true, /*HasNSW=*/true); - Value *SVELoopRes = Builder.CreateTrunc(SVELoopRes64, ResType); + Value *VectorLoopRes = + createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd); Builder.Insert(BranchInst::Create(EndBlock)); - DTU.applyUpdates({{DominatorTree::Insert, SVELoopMismatchBlock, EndBlock}}); - // Generate code for scalar loop. Builder.SetInsertPoint(LoopPreHeaderBlock); Builder.Insert(BranchInst::Create(LoopStartBlock)); - DTU.applyUpdates( - {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); - Builder.SetInsertPoint(LoopStartBlock); PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index"); IndexPhi->addIncoming(Start, LoopPreHeaderBlock); @@ -681,9 +621,6 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch( BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp); Builder.Insert(MatchCmpBr); - DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, - {DominatorTree::Insert, LoopStartBlock, EndBlock}}); - // Have we reached the maximum permitted length for the loop? 
Builder.SetInsertPoint(LoopIncBlock); Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "", @@ -694,29 +631,26 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch( BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp); Builder.Insert(IVCmpBr); - DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock}, - {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); - // In the end block we need to insert a PHI node to deal with three cases: // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen. // 2. We exitted the scalar loop early due to a mismatch and need to return // the index that we found. - // 3. We didn't find a mismatch in the SVE loop, so we return MaxLen. - // 4. We exitted the SVE loop early due to a mismatch and need to return + // 3. We didn't find a mismatch in the vector loop, so we return MaxLen. + // 4. We exited the vector loop early due to a mismatch and need to return // the index that we found. 
Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt()); PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result"); ResPhi->addIncoming(MaxLen, LoopIncBlock); ResPhi->addIncoming(IndexPhi, LoopStartBlock); - ResPhi->addIncoming(MaxLen, SVELoopIncBlock); - ResPhi->addIncoming(SVELoopRes, SVELoopMismatchBlock); + ResPhi->addIncoming(MaxLen, VectorLoopIncBlock); + ResPhi->addIncoming(VectorLoopRes, VectorLoopMismatchBlock); Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType); if (VerifyLoops) { ScalarLoop->verifyLoop(); - SVELoop->verifyLoop(); - if (!SVELoop->isRecursivelyLCSSAForm(*DT, *LI)) + VectorLoop->verifyLoop(); + if (!VectorLoop->isRecursivelyLCSSAForm(*DT, *LI)) report_fatal_error("Loops must remain in LCSSA form!"); if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI)) report_fatal_error("Loops must remain in LCSSA form!"); @@ -725,10 +659,12 @@ Value *AArch64LoopIdiomTransform::expandFindMismatch( return FinalRes; } -void AArch64LoopIdiomTransform::transformByteCompare( - GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, - Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, - BasicBlock *FoundBB, BasicBlock *EndBB) { +void LoopIdiomTransform::transformByteCompare(GetElementPtrInst *GEPA, + GetElementPtrInst *GEPB, + PHINode *IndPhi, Value *MaxLen, + Instruction *Index, Value *Start, + bool IncIdx, BasicBlock *FoundBB, + BasicBlock *EndBB) { // Insert the byte compare code at the end of the preheader block BasicBlock *Preheader = CurLoop->getLoopPreheader(); @@ -738,6 +674,11 @@ void AArch64LoopIdiomTransform::transformByteCompare( DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); + // Safeguard to check if we build the correct DomTree with DTU. 
+ auto CheckDTU = llvm::make_scope_exit([&]() { + assert(DTU.getDomTree().verify() && "Ill-formed DomTree built by DTU"); + }); + // Increment the pointer if this was done before the loads in the loop. if (IncIdx) Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1)); @@ -773,12 +714,8 @@ void AArch64LoopIdiomTransform::transformByteCompare( if (FoundBB != EndBB) { Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen); Builder.CreateCondBr(FoundCmp, EndBB, FoundBB); - DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}, - {DominatorTree::Insert, CmpBB, EndBB}}); - } else { Builder.CreateBr(FoundBB); - DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); } auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll index 27ab11446b571..3e73c4653902f 100644 --- a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll +++ b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll @@ -1,10 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -aarch64-lit -aarch64-lit-verify -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s -; RUN: opt -aarch64-lit -simplifycfg -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL -; RUN: opt -aarch64-lit -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM -; RUN: opt -p aarch64-lit -aarch64-lit-verify -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s -; RUN: opt -passes='function(loop(aarch64-lit)),simplifycfg' -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL -; RUN: opt -p aarch64-lit -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM +; RUN: opt -p loop-idiom-transform 
-verify-loop-idiom-transform -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s +; RUN: opt -passes='function(loop(loop-idiom-transform)),simplifycfg' -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL +; RUN: opt -p loop-idiom-transform -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) { ; CHECK-LABEL: define i32 @compare_bytes_simple( @@ -33,36 +30,36 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) { ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: mismatch_sve_loop_preheader: +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: mismatch_vec_loop_preheader: ; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 -; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; CHECK: mismatch_sve_loop: -; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; CHECK-NEXT: [[TMP24:%.*]] 
= getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; CHECK: mismatch_vec_loop: +; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP26]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) -; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; CHECK: mismatch_sve_loop_inc: -; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; CHECK: mismatch_vec_loop_inc: +; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]] ; CHECK-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; CHECK-NEXT: br i1 [[TMP31]], label 
[[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] -; CHECK: mismatch_sve_loop_found: -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vec_loop_found: +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) ; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 -; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 ; CHECK-NEXT: br label [[MISMATCH_END]] ; CHECK: mismatch_loop_pre: @@ -81,7 +78,7 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) { ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] ; CHECK: mismatch_end: -; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], 
[[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] ; CHECK: while.cond: ; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] @@ -128,36 +125,36 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) { ; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] ; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] ; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] -; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]] -; LOOP-DEL: mismatch_sve_loop_preheader: +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]] +; LOOP-DEL: mismatch_vec_loop_preheader: ; LOOP-DEL-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) ; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 -; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; LOOP-DEL: mismatch_sve_loop: -; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] -; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] -; LOOP-DEL-NEXT: [[TMP25:%.*]] = call 
@llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; LOOP-DEL: mismatch_vec_loop: +; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] -; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP26]], zeroinitializer ; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) -; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; LOOP-DEL: mismatch_sve_loop_inc: -; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; LOOP-DEL: mismatch_vec_loop_inc: +; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]] ; LOOP-DEL-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) ; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END:%.*]] -; LOOP-DEL: 
mismatch_sve_loop_found: -; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[WHILE_END:%.*]] +; LOOP-DEL: mismatch_vec_loop_found: +; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) ; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 -; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]] ; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 ; LOOP-DEL-NEXT: br label [[WHILE_END]] ; LOOP-DEL: mismatch_loop_pre: @@ -176,7 +173,7 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) { ; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]] ; LOOP-DEL: while.end: -; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], 
[[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; LOOP-DEL-NEXT: [[RES:%.*]] = add i32 [[MISMATCH_RESULT]], [[EXTRA]] ; LOOP-DEL-NEXT: ret i32 [[RES]] ; @@ -256,36 +253,36 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) { ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] -; CHECK: mismatch_sve_loop_preheader: +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_vec_loop_preheader: ; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 -; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; CHECK: mismatch_sve_loop: -; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; CHECK: mismatch_vec_loop: +; CHECK-NEXT: 
[[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP26]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) -; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; CHECK: mismatch_sve_loop_inc: -; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; CHECK: mismatch_vec_loop_inc: +; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]] ; CHECK-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] -; CHECK: mismatch_sve_loop_found: -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: 
[[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vec_loop_found: +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) ; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 -; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 ; CHECK-NEXT: br label [[MISMATCH_END]] ; CHECK: mismatch_loop_pre: @@ -304,7 +301,7 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) { ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] ; CHECK: mismatch_end: -; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] ; CHECK: while.cond: ; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], 
[[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] @@ -349,36 +346,36 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) { ; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] ; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] ; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] -; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] -; LOOP-DEL: mismatch_sve_loop_preheader: +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LOOP-DEL: mismatch_vec_loop_preheader: ; LOOP-DEL-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) ; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 -; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; LOOP-DEL: mismatch_sve_loop: -; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] -; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] -; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; LOOP-DEL: mismatch_vec_loop: +; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] 
+; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] -; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP26]], zeroinitializer ; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) -; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; LOOP-DEL: mismatch_sve_loop_inc: -; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; LOOP-DEL: mismatch_vec_loop_inc: +; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]] ; LOOP-DEL-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) ; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END:%.*]] -; LOOP-DEL: mismatch_sve_loop_found: -; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], 
[[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[WHILE_END:%.*]] +; LOOP-DEL: mismatch_vec_loop_found: +; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) ; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 -; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]] ; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 ; LOOP-DEL-NEXT: br label [[WHILE_END]] ; LOOP-DEL: mismatch_loop_pre: @@ -397,7 +394,7 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) { ; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]] ; LOOP-DEL: while.end: -; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]] ; ; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap( @@ -472,36 +469,36 @@ define i32 @compare_bytes_simple_end_ne_found(ptr 
%a, ptr %b, ptr %c, ptr %d, i3 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] -; CHECK: mismatch_sve_loop_preheader: +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_vec_loop_preheader: ; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 -; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; CHECK: mismatch_sve_loop: -; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; CHECK: mismatch_vec_loop: +; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr 
inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP26]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) -; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; CHECK: mismatch_sve_loop_inc: -; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; CHECK: mismatch_vec_loop_inc: +; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]] ; CHECK-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] -; CHECK: mismatch_sve_loop_found: -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vec_loop_found: +; CHECK-NEXT: 
[[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) ; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 -; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 ; CHECK-NEXT: br label [[MISMATCH_END]] ; CHECK: mismatch_loop_pre: @@ -520,7 +517,7 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3 ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] ; CHECK: mismatch_end: -; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] ; CHECK: while.cond: ; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] @@ -576,36 +573,36 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3 ; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] ; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] ; LOOP-DEL-NEXT: 
[[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] -; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] -; LOOP-DEL: mismatch_sve_loop_preheader: +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LOOP-DEL: mismatch_vec_loop_preheader: ; LOOP-DEL-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) ; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 -; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; LOOP-DEL: mismatch_sve_loop: -; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] -; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] -; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; LOOP-DEL: mismatch_vec_loop: +; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call 
@llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] -; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP26]], zeroinitializer ; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) -; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; LOOP-DEL: mismatch_sve_loop_inc: -; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; LOOP-DEL: mismatch_vec_loop_inc: +; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]] ; LOOP-DEL-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) ; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[BYTE_COMPARE:%.*]] -; LOOP-DEL: mismatch_sve_loop_found: -; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[BYTE_COMPARE:%.*]] +; LOOP-DEL: mismatch_vec_loop_found: +; LOOP-DEL-NEXT: 
[[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) ; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 -; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]] ; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 ; LOOP-DEL-NEXT: br label [[BYTE_COMPARE]] ; LOOP-DEL: mismatch_loop_pre: @@ -624,7 +621,7 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3 ; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]] ; LOOP-DEL: byte.compare: -; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; LOOP-DEL-NEXT: [[TMP45:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] ; LOOP-DEL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP45]], i32 [[N]], i32 [[MISMATCH_RESULT]] ; LOOP-DEL-NEXT: [[SPEC_SELECT4:%.*]] = select i1 [[TMP45]], ptr [[D]], ptr [[C]] @@ -729,36 +726,36 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] ; CHECK-NEXT: 
[[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] -; CHECK: mismatch_sve_loop_preheader: +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_vec_loop_preheader: ; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 -; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; CHECK: mismatch_sve_loop: -; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; CHECK: mismatch_vec_loop: +; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP23:%.*]] = call 
@llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP26]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) -; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; CHECK: mismatch_sve_loop_inc: -; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; CHECK: mismatch_vec_loop_inc: +; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]] ; CHECK-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] -; CHECK: mismatch_sve_loop_found: -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vec_loop_found: +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: 
[[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) ; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 -; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 ; CHECK-NEXT: br label [[MISMATCH_END]] ; CHECK: mismatch_loop_pre: @@ -777,7 +774,7 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] ; CHECK: mismatch_end: -; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] ; CHECK: while.cond: ; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] @@ -828,36 +825,36 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { ; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] ; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] ; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] -; LOOP-DEL-NEXT: br i1 [[TMP18]], label 
[[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] -; LOOP-DEL: mismatch_sve_loop_preheader: +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LOOP-DEL: mismatch_vec_loop_preheader: ; LOOP-DEL-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) ; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 -; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; LOOP-DEL: mismatch_sve_loop: -; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] -; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] -; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; LOOP-DEL: mismatch_vec_loop: +; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; 
LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] -; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP26]], zeroinitializer ; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) -; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; LOOP-DEL: mismatch_sve_loop_inc: -; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; LOOP-DEL: mismatch_vec_loop_inc: +; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]] ; LOOP-DEL-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) ; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END]] -; LOOP-DEL: mismatch_sve_loop_found: -; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ] -; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[WHILE_END]] +; LOOP-DEL: mismatch_vec_loop_found: +; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ 
[[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) ; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 -; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]] ; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 ; LOOP-DEL-NEXT: br label [[WHILE_END]] ; LOOP-DEL: mismatch_loop_pre: @@ -876,7 +873,7 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { ; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] ; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]] ; LOOP-DEL: while.end: -; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] ; ; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp( @@ -960,36 +957,36 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] -; CHECK: mismatch_sve_loop_preheader: +; 
CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_vec_loop_preheader: ; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 1, i64 0) ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul nuw nsw i64 [[TMP16]], 16 -; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] -; CHECK: mismatch_sve_loop: -; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP15]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] -; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_SVE_INDEX]] -; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: br label [[MISMATCH_VEC_LOOP:%.*]] +; CHECK: mismatch_vec_loop: +; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi [ [[TMP15]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VEC_INDEX]] +; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, [[MISMATCH_VEC_LOOP_PRED]], zeroinitializer) ; CHECK-NEXT: 
[[TMP22:%.*]] = icmp ne [[TMP19]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = select [[MISMATCH_VEC_LOOP_PRED]], [[TMP22]], zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP23]]) -; CHECK-NEXT: br i1 [[TMP24]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] -; CHECK: mismatch_sve_loop_inc: -; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP17]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]] +; CHECK: mismatch_vec_loop_inc: +; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP17]] ; CHECK-NEXT: [[TMP26]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP25]], i64 0) ; CHECK-NEXT: [[TMP27:%.*]] = extractelement [[TMP26]], i64 0 -; CHECK-NEXT: br i1 [[TMP27]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] -; CHECK: mismatch_sve_loop_found: -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi [ [[TMP23]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ] -; CHECK-NEXT: [[TMP28:%.*]] = and [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vec_loop_found: +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi [ [[TMP23]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ] +; CHECK-NEXT: [[TMP28:%.*]] = and [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]] ; CHECK-NEXT: [[TMP29:%.*]] = call i32 
@llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP28]], i1 true) ; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP29]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP30]] ; CHECK-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 ; CHECK-NEXT: br label [[MISMATCH_END]] ; CHECK: mismatch_loop_pre: @@ -1008,7 +1005,7 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) { ; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP39]], 0 ; CHECK-NEXT: br i1 [[TMP40]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] ; CHECK: mismatch_end: -; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP32]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP32]], [[MISMATCH_VEC_LOOP_FOUND]] ] ; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] ; CHECK: while.cond: ; CHECK-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ] From 94bba1111dc22798637588868e4166884ad8c70a Mon Sep 17 00:00:00 2001 From: Min Hsu Date: Tue, 4 Jun 2024 12:01:56 -0700 Subject: [PATCH 2/4] Rename LoopIdiomTransform to LoopIdiomVectorize --- ...pIdiomTransform.h => LoopIdiomVectorize.h} | 10 +++--- llvm/lib/Passes/PassBuilder.cpp | 2 +- llvm/lib/Passes/PassRegistry.def | 2 +- .../Target/AArch64/AArch64TargetMachine.cpp | 4 +-- llvm/lib/Transforms/Vectorize/CMakeLists.txt | 2 +- ...omTransform.cpp => LoopIdiomVectorize.cpp} | 34 +++++++++---------- .../LoopIdiom/AArch64/byte-compare-index.ll | 6 ++-- 7 files changed, 30 insertions(+), 30 deletions(-) rename llvm/include/llvm/Transforms/Vectorize/{LoopIdiomTransform.h => LoopIdiomVectorize.h} (65%) rename 
llvm/lib/Transforms/Vectorize/{LoopIdiomTransform.cpp => LoopIdiomVectorize.cpp} (97%) diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h similarity index 65% rename from llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h rename to llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h index a97dcc7ae3a3f..56f44b7dc6b2a 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopIdiomTransform.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopIdiomVectorize.h @@ -1,4 +1,4 @@ -//===----------LoopIdiomTransform.h -----------------------------*- C++ -*-===// +//===----------LoopIdiomVectorize.h -----------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,16 +6,16 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H -#define LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H +#ifndef LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMVECTORIZE_H +#define LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMVECTORIZE_H #include "llvm/IR/PassManager.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" namespace llvm { -struct LoopIdiomTransformPass : PassInfoMixin { +struct LoopIdiomVectorizePass : PassInfoMixin { PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); }; } // namespace llvm -#endif // LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMTRANSFORM_H +#endif // LLVM_LIB_TRANSFORMS_VECTORIZE_LOOPIDIOMVECTORIZE_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 76305606c74b7..9d8c3c4d7bdee 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -297,7 +297,7 @@ #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include 
"llvm/Transforms/Utils/UnifyLoopExits.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" -#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h" +#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 714058f91bfc6..f71745a77a19b 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -621,7 +621,7 @@ LOOP_PASS("invalidate", InvalidateAllAnalysesPass()) LOOP_PASS("loop-bound-split", LoopBoundSplitPass()) LOOP_PASS("loop-deletion", LoopDeletionPass()) LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) -LOOP_PASS("loop-idiom-transform", LoopIdiomTransformPass()) +LOOP_PASS("loop-idiom-vectorize", LoopIdiomVectorizePass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) LOOP_PASS("loop-predication", LoopPredicationPass()) LOOP_PASS("loop-reduce", LoopStrengthReducePass()) diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 0d4050a8bdf2c..7de9071476e7f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -51,7 +51,7 @@ #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/CFGuard.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h" +#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include #include #include @@ -554,7 +554,7 @@ void AArch64TargetMachine::registerPassBuilderCallbacks( PB.registerLateLoopOptimizationsEPCallback( [=](LoopPassManager &LPM, OptimizationLevel Level) { - LPM.addPass(LoopIdiomTransformPass()); + LPM.addPass(LoopIdiomVectorizePass()); }); } diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 
3ca5c404d020f..4caec07c5ac43 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,6 +1,6 @@ add_llvm_component_library(LLVMVectorize LoadStoreVectorizer.cpp - LoopIdiomTransform.cpp + LoopIdiomVectorize.cpp LoopVectorizationLegality.cpp LoopVectorize.cpp SLPVectorizer.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp similarity index 97% rename from llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp rename to llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index c9f8189660321..e0030176ce4c7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomTransform.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -1,4 +1,4 @@ -//===-------- LoopIdiomTransform.cpp - Loop idiom recognition -------------===// +//===-------- LoopIdiomVectorize.cpp - Loop idiom recognition -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -35,7 +35,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Vectorize/LoopIdiomTransform.h" +#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopPass.h" @@ -50,24 +50,24 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "loop-idiom-transform" +#define DEBUG_TYPE "loop-idiom-vectorize" -static cl::opt DisableAll("disable-loop-idiom-transform-all", cl::Hidden, +static cl::opt DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden, cl::init(false), cl::desc("Disable Loop Idiom Transform Pass.")); static cl::opt - DisableByteCmp("disable-loop-idiom-transform-bytecmp", cl::Hidden, + DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden, cl::init(false), cl::desc("Proceed with Loop Idiom Transform Pass, but do " "not convert byte-compare loop(s).")); static cl::opt - VerifyLoops("verify-loop-idiom-transform", cl::Hidden, cl::init(false), + VerifyLoops("verify-loop-idiom-vectorize", cl::Hidden, cl::init(false), cl::desc("Verify loops generated Loop Idiom Transform Pass.")); namespace { -class LoopIdiomTransform { +class LoopIdiomVectorize { Loop *CurLoop = nullptr; DominatorTree *DT; LoopInfo *LI; @@ -82,7 +82,7 @@ class LoopIdiomTransform { BasicBlock *VectorLoopIncBlock = nullptr; public: - explicit LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + explicit LoopIdiomVectorize(DominatorTree *DT, LoopInfo *LI, const TargetTransformInfo *TTI, const DataLayout *DL) : DT(DT), LI(LI), TTI(TTI), DL(DL) {} @@ -115,7 +115,7 @@ class LoopIdiomTransform { }; } // anonymous namespace -PreservedAnalyses LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, +PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { if (DisableAll) @@ -123,7 +123,7 @@ PreservedAnalyses 
LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, const auto *DL = &L.getHeader()->getModule()->getDataLayout(); - LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); + LoopIdiomVectorize LIT(&AR.DT, &AR.LI, &AR.TTI, DL); if (!LIT.run(&L)) return PreservedAnalyses::all(); @@ -132,11 +132,11 @@ PreservedAnalyses LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, //===----------------------------------------------------------------------===// // -// Implementation of LoopIdiomTransform +// Implementation of LoopIdiomVectorize // //===----------------------------------------------------------------------===// -bool LoopIdiomTransform::run(Loop *L) { +bool LoopIdiomVectorize::run(Loop *L) { CurLoop = L; Function &F = *L->getHeader()->getParent(); @@ -160,7 +160,7 @@ bool LoopIdiomTransform::run(Loop *L) { return recognizeByteCompare(); } -bool LoopIdiomTransform::recognizeByteCompare() { +bool LoopIdiomVectorize::recognizeByteCompare() { // Currently the transformation only works on scalable vector types, although // there is no fundamental reason why it cannot be made to work for fixed // width too. @@ -173,7 +173,7 @@ bool LoopIdiomTransform::recognizeByteCompare() { BasicBlock *Header = CurLoop->getHeader(); - // In LoopIdiomTransform::run we have already checked that the loop + // In LoopIdiomVectorize::run we have already checked that the loop // has a preheader so we can assume it's in a canonical form. 
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) return false; @@ -340,7 +340,7 @@ bool LoopIdiomTransform::recognizeByteCompare() { return true; } -Value *LoopIdiomTransform::createMaskedFindMismatch(IRBuilder<> &Builder, +Value *LoopIdiomVectorize::createMaskedFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, Value *ExtStart, @@ -440,7 +440,7 @@ Value *LoopIdiomTransform::createMaskedFindMismatch(IRBuilder<> &Builder, return Builder.CreateTrunc(VectorLoopRes64, ResType); } -Value *LoopIdiomTransform::expandFindMismatch( +Value *LoopIdiomVectorize::expandFindMismatch( IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { Value *PtrA = GEPA->getPointerOperand(); @@ -659,7 +659,7 @@ Value *LoopIdiomTransform::expandFindMismatch( return FinalRes; } -void LoopIdiomTransform::transformByteCompare(GetElementPtrInst *GEPA, +void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, Value *MaxLen, Instruction *Index, Value *Start, diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll index 3e73c4653902f..d54b97fe45dd6 100644 --- a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll +++ b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -p loop-idiom-transform -verify-loop-idiom-transform -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s -; RUN: opt -passes='function(loop(loop-idiom-transform)),simplifycfg' -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL -; RUN: opt -p loop-idiom-transform -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s 
--check-prefix=NO-TRANSFORM +; RUN: opt -p loop-idiom-vectorize -verify-loop-idiom-vectorize -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s +; RUN: opt -passes='function(loop(loop-idiom-vectorize)),simplifycfg' -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL +; RUN: opt -p loop-idiom-vectorize -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) { ; CHECK-LABEL: define i32 @compare_bytes_simple( From 34d0d4da377ba521cbc2f1aee9dad2f4537189c7 Mon Sep 17 00:00:00 2001 From: Min Hsu Date: Tue, 4 Jun 2024 13:31:10 -0700 Subject: [PATCH 3/4] fixup! Rename LoopIdiomTransform to LoopIdiomVectorize --- llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index e0030176ce4c7..d5443dafe1b9b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -26,6 +26,10 @@ // //===----------------------------------------------------------------------===// // +// NOTE: This Pass matches a really specific loop pattern because it's only +// supposed to be a temporary solution until our LoopVectorizer is powerful +// enough to vectorize it automatically. +// // TODO List: // // * Add support for the inverse case where we scan for a matching element. 
From 267750f58422e248c92f8d21870cd5086c6151f2 Mon Sep 17 00:00:00 2001 From: Min Hsu Date: Wed, 5 Jun 2024 16:25:46 -0700 Subject: [PATCH 4/4] Split refactoring and other changes into separate patches --- .../Vectorize/LoopIdiomVectorize.cpp | 271 +++++++++--------- .../LoopIdiom/AArch64/byte-compare-index.ll | 2 +- 2 files changed, 138 insertions(+), 135 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index d5443dafe1b9b..38095b1433ebe 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -1,4 +1,4 @@ -//===-------- LoopIdiomVectorize.cpp - Loop idiom recognition -------------===// +//===-------- LoopIdiomVectorize.cpp - Loop idiom vectorization -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -40,7 +40,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" -#include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -58,19 +57,20 @@ using namespace PatternMatch; static cl::opt DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden, cl::init(false), - cl::desc("Disable Loop Idiom Transform Pass.")); + cl::desc("Disable Loop Idiom Vectorize Pass.")); static cl::opt DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden, cl::init(false), - cl::desc("Proceed with Loop Idiom Transform Pass, but do " + cl::desc("Proceed with Loop Idiom Vectorize Pass, but do " "not convert byte-compare loop(s).")); static cl::opt - VerifyLoops("verify-loop-idiom-vectorize", cl::Hidden, cl::init(false), - cl::desc("Verify loops generated Loop Idiom Transform Pass.")); + VerifyLoops("loop-idiom-vectorize-verify", 
cl::Hidden, cl::init(false), + cl::desc("Verify loops generated Loop Idiom Vectorize Pass.")); namespace { + class LoopIdiomVectorize { Loop *CurLoop = nullptr; DominatorTree *DT; @@ -78,13 +78,6 @@ class LoopIdiomVectorize { const TargetTransformInfo *TTI; const DataLayout *DL; - // Blocks that will be used for inserting vectorized code. - BasicBlock *EndBlock = nullptr; - BasicBlock *VectorLoopPreheaderBlock = nullptr; - BasicBlock *VectorLoopStartBlock = nullptr; - BasicBlock *VectorLoopMismatchBlock = nullptr; - BasicBlock *VectorLoopIncBlock = nullptr; - public: explicit LoopIdiomVectorize(DominatorTree *DT, LoopInfo *LI, const TargetTransformInfo *TTI, @@ -102,15 +95,9 @@ class LoopIdiomVectorize { SmallVectorImpl &ExitBlocks); bool recognizeByteCompare(); - Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen); - - Value *createMaskedFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA, - GetElementPtrInst *GEPB, Value *ExtStart, - Value *ExtEnd); - void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, BasicBlock *FoundBB, @@ -344,106 +331,6 @@ bool LoopIdiomVectorize::recognizeByteCompare() { return true; } -Value *LoopIdiomVectorize::createMaskedFindMismatch(IRBuilder<> &Builder, - GetElementPtrInst *GEPA, - GetElementPtrInst *GEPB, - Value *ExtStart, - Value *ExtEnd) { - Type *I64Type = Builder.getInt64Ty(); - Type *ResType = Builder.getInt32Ty(); - Type *LoadType = Builder.getInt8Ty(); - Value *PtrA = GEPA->getPointerOperand(); - Value *PtrB = GEPB->getPointerOperand(); - - // At this point we know two things must be true: - // 1. Start <= End - // 2. ExtMaxLen <= MinPageSize due to the page checks. - // Therefore, we know that we can use a 64-bit induction variable that - // starts from 0 -> ExtMaxLen and it will not overflow. 
- ScalableVectorType *PredVTy = - ScalableVectorType::get(Builder.getInt1Ty(), 16); - - Value *InitialPred = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); - - Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); - VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "", - /*HasNUW=*/true, /*HasNSW=*/true); - - Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), - Builder.getInt1(false)); - - BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock); - Builder.Insert(JumpToVectorLoop); - - // Set up the first vector loop block by creating the PHIs, doing the vector - // loads and comparing the vectors. - Builder.SetInsertPoint(VectorLoopStartBlock); - PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred"); - LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock); - PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index"); - VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock); - Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16); - Value *Passthru = ConstantInt::getNullValue(VectorLoadType); - - Value *VectorLhsGep = - Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds()); - Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep, - Align(1), LoopPred, Passthru); - - Value *VectorRhsGep = - Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds()); - Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep, - Align(1), LoopPred, Passthru); - - Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad); - VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse); - Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp); - BranchInst *VectorEarlyExit = BranchInst::Create( - VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes); 
- Builder.Insert(VectorEarlyExit); - - // Increment the index counter and calculate the predicate for the next - // iteration of the loop. We branch back to the start of the loop if there - // is at least one active lane. - Builder.SetInsertPoint(VectorLoopIncBlock); - Value *NewVectorIndexPhi = - Builder.CreateAdd(VectorIndexPhi, VecLen, "", - /*HasNUW=*/true, /*HasNSW=*/true); - VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock); - Value *NewPred = - Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, - {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd}); - LoopPred->addIncoming(NewPred, VectorLoopIncBlock); - - Value *PredHasActiveLanes = - Builder.CreateExtractElement(NewPred, uint64_t(0)); - BranchInst *VectorLoopBranchBack = - BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes); - Builder.Insert(VectorLoopBranchBack); - - // If we found a mismatch then we need to calculate which lane in the vector - // had a mismatch and add that on to the current loop index. 
- Builder.SetInsertPoint(VectorLoopMismatchBlock); - PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred"); - FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock); - PHINode *LastLoopPred = - Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred"); - LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock); - PHINode *VectorFoundIndex = - Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index"); - VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock); - - Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred); - Value *Ctz = Builder.CreateIntrinsic( - Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, - {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); - Ctz = Builder.CreateZExt(Ctz, I64Type); - Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "", - /*HasNUW=*/true, /*HasNSW=*/true); - return Builder.CreateTrunc(VectorLoopRes64, ResType); -} - Value *LoopIdiomVectorize::expandFindMismatch( IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { @@ -458,7 +345,8 @@ Value *LoopIdiomVectorize::expandFindMismatch( Type *ResType = Builder.getInt32Ty(); // Split block in the original loop preheader. - EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); + BasicBlock *EndBlock = + SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); // Create the blocks that we're going to need: // 1. 
A block for checking the zero-extended length exceeds 0 @@ -482,17 +370,17 @@ Value *LoopIdiomVectorize::expandFindMismatch( BasicBlock *MemCheckBlock = BasicBlock::Create( Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock); - VectorLoopPreheaderBlock = BasicBlock::Create( + BasicBlock *VectorLoopPreheaderBlock = BasicBlock::Create( Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock); - VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop", - EndBlock->getParent(), EndBlock); + BasicBlock *VectorLoopStartBlock = BasicBlock::Create( + Ctx, "mismatch_vec_loop", EndBlock->getParent(), EndBlock); - VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc", - EndBlock->getParent(), EndBlock); + BasicBlock *VectorLoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_vec_loop_inc", EndBlock->getParent(), EndBlock); - VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found", - EndBlock->getParent(), EndBlock); + BasicBlock *VectorLoopMismatchBlock = BasicBlock::Create( + Ctx, "mismatch_vec_loop_found", EndBlock->getParent(), EndBlock); BasicBlock *LoopPreHeaderBlock = BasicBlock::Create( Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock); @@ -548,6 +436,10 @@ Value *LoopIdiomVectorize::expandFindMismatch( MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1)); Builder.Insert(MinItCheckBr); + DTU.applyUpdates( + {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, + {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); + // For each of the arrays, check the start/end addresses are on the same // page. Builder.SetInsertPoint(MemCheckBlock); @@ -590,20 +482,126 @@ Value *LoopIdiomVectorize::expandFindMismatch( .createBranchWeights(10, 90)); Builder.Insert(CombinedPageCmpCmpBr); + DTU.applyUpdates( + {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, + {DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}}); + // Set up the vector loop preheader, i.e. 
calculate initial loop predicate, // zero-extend MaxLen to 64-bits, determine the number of vector elements // processed in each iteration, etc. Builder.SetInsertPoint(VectorLoopPreheaderBlock); - Value *VectorLoopRes = - createMaskedFindMismatch(Builder, GEPA, GEPB, ExtStart, ExtEnd); + // At this point we know two things must be true: + // 1. Start <= End + // 2. ExtMaxLen <= MinPageSize due to the page checks. + // Therefore, we know that we can use a 64-bit induction variable that + // starts from 0 -> ExtMaxLen and it will not overflow. + ScalableVectorType *PredVTy = + ScalableVectorType::get(Builder.getInt1Ty(), 16); + + Value *InitialPred = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); + + Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); + VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "", + /*HasNUW=*/true, /*HasNSW=*/true); + + Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), + Builder.getInt1(false)); + + BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock); + Builder.Insert(JumpToVectorLoop); + + DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock, + VectorLoopStartBlock}}); + + // Set up the first vector loop block by creating the PHIs, doing the vector + // loads and comparing the vectors. 
+ Builder.SetInsertPoint(VectorLoopStartBlock); + PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred"); + LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock); + PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index"); + VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock); + Type *VectorLoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16); + Value *Passthru = ConstantInt::getNullValue(VectorLoadType); + + Value *VectorLhsGep = + Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds()); + Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep, + Align(1), LoopPred, Passthru); + + Value *VectorRhsGep = + Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds()); + Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep, + Align(1), LoopPred, Passthru); + + Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad); + VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse); + Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp); + BranchInst *VectorEarlyExit = BranchInst::Create( + VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes); + Builder.Insert(VectorEarlyExit); + + DTU.applyUpdates( + {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock}, + {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}}); + + // Increment the index counter and calculate the predicate for the next + // iteration of the loop. We branch back to the start of the loop if there + // is at least one active lane. 
+ Builder.SetInsertPoint(VectorLoopIncBlock); + Value *NewVectorIndexPhi = + Builder.CreateAdd(VectorIndexPhi, VecLen, "", + /*HasNUW=*/true, /*HasNSW=*/true); + VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock); + Value *NewPred = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd}); + LoopPred->addIncoming(NewPred, VectorLoopIncBlock); + + Value *PredHasActiveLanes = + Builder.CreateExtractElement(NewPred, uint64_t(0)); + BranchInst *VectorLoopBranchBack = + BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes); + Builder.Insert(VectorLoopBranchBack); + + DTU.applyUpdates( + {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock}, + {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}}); + + // If we found a mismatch then we need to calculate which lane in the vector + // had a mismatch and add that on to the current loop index. + Builder.SetInsertPoint(VectorLoopMismatchBlock); + PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred"); + FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock); + PHINode *LastLoopPred = + Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred"); + LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock); + PHINode *VectorFoundIndex = + Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index"); + VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock); + + Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred); + Value *Ctz = Builder.CreateIntrinsic( + Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, + {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); + Ctz = Builder.CreateZExt(Ctz, I64Type); + Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "", + /*HasNUW=*/true, /*HasNSW=*/true); + Value *VectorLoopRes = Builder.CreateTrunc(VectorLoopRes64, ResType); Builder.Insert(BranchInst::Create(EndBlock)); + DTU.applyUpdates( + 
{{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}}); + // Generate code for scalar loop. Builder.SetInsertPoint(LoopPreHeaderBlock); Builder.Insert(BranchInst::Create(LoopStartBlock)); + DTU.applyUpdates( + {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); + Builder.SetInsertPoint(LoopStartBlock); PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index"); IndexPhi->addIncoming(Start, LoopPreHeaderBlock); @@ -625,6 +623,9 @@ Value *LoopIdiomVectorize::expandFindMismatch( BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp); Builder.Insert(MatchCmpBr); + DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, + {DominatorTree::Insert, LoopStartBlock, EndBlock}}); + // Have we reached the maximum permitted length for the loop? Builder.SetInsertPoint(LoopIncBlock); Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "", @@ -635,6 +636,9 @@ Value *LoopIdiomVectorize::expandFindMismatch( BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp); Builder.Insert(IVCmpBr); + DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock}, + {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); + // In the end block we need to insert a PHI node to deal with three cases: // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen. // 2. We exitted the scalar loop early due to a mismatch and need to return @@ -678,11 +682,6 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); - // Safeguard to check if we build the correct DomTree with DTU. - auto CheckDTU = llvm::make_scope_exit([&]() { - assert(DTU.getDomTree().verify() && "Ill-formed DomTree built by DTU"); - }); - // Increment the pointer if this was done before the loads in the loop. 
if (IncIdx) Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1)); @@ -718,8 +717,12 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, if (FoundBB != EndBB) { Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen); Builder.CreateCondBr(FoundCmp, EndBB, FoundBB); + DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}, + {DominatorTree::Insert, CmpBB, EndBB}}); + } else { Builder.CreateBr(FoundBB); + DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); } auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll index d54b97fe45dd6..39037761c81bb 100644 --- a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll +++ b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -p loop-idiom-vectorize -verify-loop-idiom-vectorize -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s +; RUN: opt -p loop-idiom-vectorize -loop-idiom-vectorize-verify -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s ; RUN: opt -passes='function(loop(loop-idiom-vectorize)),simplifycfg' -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL ; RUN: opt -p loop-idiom-vectorize -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM