[LoopIdiomVectorize] Recognize and transform minidx pattern #144987


Open · wants to merge 4 commits into main

Conversation

@madhur13490 (Contributor) commented Jun 20, 2025

This patch vectorizes the case where the array scan happens backwards and the first minidx is returned. The motivating example is found in the rnflow Fortran benchmark.

The actual Fortran code:

integer function minlst(ipos1, ipos2)
    integer, intent(in) :: ipos1, ipos2
    integer :: ipos
    minlst = ipos2
    do ipos = ipos2 - 1, ipos1, -1
        if (xxtrt(ipos) < xxtrt(minlst)) then
            minlst = ipos
        end if
    end do
end function minlst
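
For readers less familiar with Fortran, a roughly equivalent C++ sketch of the same loop is shown below (an illustration only, not part of the patch; it assumes xxtrt is a plain float array and keeps the 1-based Fortran indexing):

// Illustrative C++ equivalent of minlst: scan xxtrt backwards from
// ipos2 - 1 down to ipos1, updating the running minimum index whenever a
// strictly smaller value is found during the backwards scan.
int minlst(const float *xxtrt, int ipos1, int ipos2) {
  int minIdx = ipos2;
  for (int ipos = ipos2 - 1; ipos >= ipos1; --ipos)
    if (xxtrt[ipos - 1] < xxtrt[minIdx - 1]) // 1-based indices, as in Fortran
      minIdx = ipos;
  return minIdx;
}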

Pre-commit test can be found as part of #141556

Vectorizing this pattern results in a 4x speedup for search sizes greater than 4096, while no noticeable regression was found for smaller sizes.

@llvmbot (Member) commented Jun 20, 2025

@llvm/pr-subscribers-vectorizers

Author: Madhur Amilkanthwar (madhur13490)

Changes

This patch vectorizes the case where the array scan happens backwards and the first minidx is returned. The motivating example is found in the rnflow Fortran benchmark.

Pre-commit test can be found as part of #141556


Patch is 62.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144987.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp (+714)
  • (added) llvm/test/Transforms/LoopVectorize/last-min-index-ftn.ll (+291)
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 491f0b76f4ae0..afb6f6aea4d59 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -70,10 +70,12 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cstdint>
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -99,6 +101,11 @@ static cl::opt<bool>
                    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
                             "not convert byte-compare loop(s)."));
 
+static cl::opt<bool> DisableMinMaxlocPattern(
+    "disable-loop-idiom-vectorize-minmaxloc", cl::Hidden, cl::init(false),
+    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
+             "not convert minloc/maxloc loop(s)."));
+
 static cl::opt<unsigned>
     ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
               cl::desc("The vectorization factor for byte-compare patterns."),
@@ -149,6 +156,13 @@ class LoopIdiomVectorize {
 
   bool recognizeByteCompare();
 
+  bool recognizeMinIdxPattern();
+
+  bool transformMinIdxPattern(unsigned VF, Value *FirstIndex,
+                              Value *SecondIndex, BasicBlock *LoopPreheader,
+                              Value *BasePtr, BasicBlock *Header,
+                              BasicBlock *ExitBB, Type *LoadType);
+
   Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
                             GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             Instruction *Index, Value *Start, Value *MaxLen);
@@ -239,9 +253,709 @@ bool LoopIdiomVectorize::run(Loop *L) {
   if (recognizeFindFirstByte())
     return true;
 
+  if (recognizeMinIdxPattern())
+    return true;
+
   return false;
 }
 
+bool LoopIdiomVectorize::recognizeMinIdxPattern() {
+  BasicBlock *Header = CurLoop->getHeader();
+  Function *F = Header->getParent();
+  BasicBlock *LoopPreheader = CurLoop->getLoopPreheader();
+
+  if (!TTI->supportsScalableVectors() || DisableMinMaxlocPattern) {
+    LLVM_DEBUG(dbgs() << "Does not meet pre-requisites for minidx idiom\n");
+    return false;
+  }
+
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) {
+    LLVM_DEBUG(dbgs() << "Loop does not have exactly 1 back edge and 1 "
+                         "block, as required for the minidx pattern\n");
+    return false;
+  }
+
+  if (Header->sizeWithoutDebug() < 14) {
+    LLVM_DEBUG(dbgs() << "Header block is too small for minloc pattern\n");
+    return false;
+  }
+
+  // We need the below things to be able to transform the pattern:
+  // 1. First index. For this we look at the terminator instruction of
+  // the predecessor of the loop preheader. The condition of the terminator
+  // instruction decides whether to jump to scalar loop.
+  // 2. Second index.
+  // 3. Base pointer.
+  // For 2 and 3, we iterate backward from the header block to find the select
+  // instruction. The select instruction should be of the form select (fcmp
+  // contract olt loadA, loadB). Further details below. Once we find the
+  // required pattern, we can extract the base pointer from the first load
+  // instruction.
+  // 4. Exit basic block. For this we look at the terminator instruction of the
+  // header block.
+
+  // Extract the first index from the preheader.
+  // Example LLVM IR to inspect:
+  // %4 = load i32, ptr %1, align 4
+  // %5 = load i32, ptr %0, align 4
+  // %6 = sext i32 %5 to i64
+  // %7 = sub i32 0, %4
+  // %8 = sext i32 %7 to i64
+  // %9 = add nsw i64 %8, %6
+  // %10 = sub nsw i64 0, %9
+  // %invariant.gep = ...
+  // %invariant.gep1 = ...
+  // %11 = icmp slt i64 %9, 0
+  // br i1 %11, label %.loop_preheader, ...
+  Value *ICmpSLTFirstVal = nullptr, *FirstIndex = nullptr;
+  BasicBlock *LoopPreheaderBB = nullptr, *RetBB = nullptr;
+  BasicBlock *PreheaderPred = LoopPreheader->getSinglePredecessor();
+  if (!match(PreheaderPred->getTerminator(),
+             m_Br(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Value(ICmpSLTFirstVal),
+                                 m_ZeroInt()),
+                  m_BasicBlock(LoopPreheaderBB), m_BasicBlock(RetBB)))) {
+    LLVM_DEBUG(dbgs() << "Terminator doesn't match expected pattern\n");
+    return false;
+  }
+
+  // The Add operand can be either below:
+  // 1. add(sext(sub(0 - ipos2)), sext(ipos1))
+  // 2. add(sext(ipos1), sext(sub(0 - ipos2)))
+  // This depends on whether canonicalization has been done or not.
+  if (match(ICmpSLTFirstVal, m_Add(m_SExt(m_Sub(m_ZeroInt(), m_Value())),
+                                   (m_SExt(m_Value()))))) {
+    FirstIndex = dyn_cast<Instruction>(ICmpSLTFirstVal)->getOperand(1);
+  } else if (match(ICmpSLTFirstVal,
+                   m_Add(m_SExt(m_Value()),
+                         m_SExt(m_Sub(m_ZeroInt(), m_Value()))))) {
+    FirstIndex = dyn_cast<Instruction>(ICmpSLTFirstVal)->getOperand(0);
+  } else {
+    LLVM_DEBUG(dbgs() << "Cannot extract FirstIndex from ICmpSLTFirstVal\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "FirstIndex is " << *FirstIndex << "\n");
+
+  BasicBlock::reverse_iterator RI = Header->rbegin();
+  SelectInst *SelectToInspect = nullptr;
+  Value *BasePtr = nullptr;
+  Instruction *Trunc = nullptr;
+
+  // Iterate in backward direction to extract the select instruction which
+  // matches the pattern:
+
+  // %load1_gep = getelementptr float, ptr %invariant.gep, i64 %indvars.iv
+  // %load1 = load float, ptr %load1_gep, align 4
+  // %load2_gep = getelementptr float, ptr ..., ...
+  // %load2 = load float, ptr %load2_gep, align 4
+  // %trunc = trunc nsw i64 %indvars.iv.next to i32
+  // %fcmp = fcmp contract olt float %load1, %load2
+  // %select = select i1 %fcmp, i32 %trunc, i32 <phi>
+  // %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  while (RI != Header->rend()) {
+    if (auto *Sel = dyn_cast<SelectInst>(&*RI)) {
+      if (match(Sel, m_Select(m_SpecificFCmp(
+                                  FCmpInst::FCMP_OLT,
+                                  m_Load(m_GEP(m_Value(BasePtr), m_Value())),
+                                  m_Load(m_GEP(m_Value(), m_Value()))),
+                              m_Instruction(Trunc), m_Value()))) {
+        SelectToInspect = Sel;
+      }
+    }
+    ++RI;
+  }
+  if (!SelectToInspect || !BasePtr) {
+    LLVM_DEBUG(dbgs() << "Select or BasePtr not found\n");
+    return false;
+  }
+
+  // Extract FCmp and validate load types
+  auto *FCmp = dyn_cast<FCmpInst>(SelectToInspect->getCondition());
+  if (!FCmp || !isa<LoadInst>(FCmp->getOperand(0)) ||
+      !isa<LoadInst>(FCmp->getOperand(1)))
+    return false;
+
+  auto *LoadA = cast<LoadInst>(FCmp->getOperand(0));
+  auto *LoadB = cast<LoadInst>(FCmp->getOperand(1));
+
+  if (LoadA->getType() != LoadB->getType()) {
+    LLVM_DEBUG(dbgs() << "Load types don't match\n");
+    return false;
+  }
+
+  // Validate truncation instruction matches expected pattern
+  TruncInst *TInst = dyn_cast<TruncInst>(Trunc);
+  if (!TInst || TInst->getDestTy() != F->getReturnType()) {
+    LLVM_DEBUG(dbgs() << "Trunc instruction validation failed\n");
+    return false;
+  }
+  // Trunc instruction's operand should be of the form (add IVPHI, -1).
+  Instruction *IVInst = nullptr;
+  if (!match(TInst->getOperand(0),
+             m_Add(m_Instruction(IVInst), m_SpecificInt(-1)))) {
+    LLVM_DEBUG(
+        dbgs() << "Trunc instruction operand doesn't match expected pattern\n");
+    return false;
+  }
+
+  PHINode *IVPhi = dyn_cast<PHINode>(IVInst);
+  if (!IVPhi) {
+    LLVM_DEBUG(dbgs() << "Add operand of trunc instruction is not a PHINode\n");
+    return false;
+  }
+
+  Value *SecondIndex = IVPhi->getIncomingValueForBlock(LoopPreheader);
+  LLVM_DEBUG(dbgs() << "SecondIndex is " << *SecondIndex << "\n");
+
+  // 4. Inspect Terminator to extract the exit block.
+  // Example LLVM IR to inspect:
+  //   %20 = icmp sgt i64 %13, 1
+  //   br i1 %20, label %.lr.ph, label %._crit_edge.loopexit
+  Value *ICmpFirstVal = nullptr;
+  BasicBlock *FalseBB = nullptr;
+  BranchInst *Terminator = dyn_cast<BranchInst>(Header->getTerminator());
+  if (!match(Terminator, m_Br(m_SpecificICmp(ICmpInst::ICMP_SGT,
+                                             m_Value(ICmpFirstVal), m_One()),
+                              m_BasicBlock(Header), m_BasicBlock(FalseBB)))) {
+    LLVM_DEBUG(dbgs() << "Terminator doesn't match expected pattern\n");
+    return false;
+  }
+
+  unsigned VF = 128 / LoadA->getType()->getPrimitiveSizeInBits();
+
+  // We've recognized the pattern, now transform it.
+  LLVM_DEBUG(dbgs() << "FOUND MINIDX PATTERN\n");
+
+  return transformMinIdxPattern(VF, FirstIndex, SecondIndex, LoopPreheader,
+                                BasePtr, Header, FalseBB, LoadA->getType());
+}
+
+bool LoopIdiomVectorize::transformMinIdxPattern(
+    unsigned VF, Value *FirstIndex, Value *SecondIndex,
+    BasicBlock *LoopPreheader, Value *BasePtr, BasicBlock *Header,
+    BasicBlock *ExitBB, Type *LoadType) {
+
+  LLVMContext &Ctx = Header->getContext();
+  Function *F = Header->getParent();
+  Module *M = F->getParent();
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  Type *I32Ty = Type::getInt32Ty(Ctx);
+  Type *I64Ty = Type::getInt64Ty(Ctx);
+  Type *I1Ty = Type::getInt1Ty(Ctx);
+  Type *PointerType = PointerType::get(Ctx, 0);
+  auto *MaskTy = ScalableVectorType::get(Type::getInt1Ty(Ctx), 4);
+  auto *VecTy = ScalableVectorType::get(
+      LoadType, VF); // Scalable vector type of the loop's load element type.
+
+  BasicBlock *VecEntry = BasicBlock::Create(Ctx, "minidx.vec.entry", F);
+  BasicBlock *MinIdxPartial1If =
+      BasicBlock::Create(Ctx, "minidx.partial.1.if", F);
+  BasicBlock *MinIdxPartial1ProcExit =
+      BasicBlock::Create(Ctx, "minidx.partial.1.proc.exit", F);
+  BasicBlock *MinIdxWhileBodyLrPh =
+      BasicBlock::Create(Ctx, "minidx.while.body.ph", F);
+  BasicBlock *MinIdxVectBody = BasicBlock::Create(Ctx, "minidx.vect.body", F);
+  BasicBlock *MinIdxVectUpdate =
+      BasicBlock::Create(Ctx, "minidx.vect.update", F);
+  BasicBlock *MinIdxVectContinue =
+      BasicBlock::Create(Ctx, "minidx.vect.continue", F);
+  BasicBlock *MinIdxVectEnd = BasicBlock::Create(Ctx, "minidx.vect.end", F);
+  BasicBlock *MinIdxPartial2If =
+      BasicBlock::Create(Ctx, "minidx.partial.2.if", F);
+  BasicBlock *MinIdxPartial2Exit =
+      BasicBlock::Create(Ctx, "minidx.partial.2.exit", F);
+  BasicBlock *MinIdxEnd = BasicBlock::Create(Ctx, "minidx.end", F);
+
+  Loop *VecLoop = LI->AllocateLoop();
+  VecLoop->addBasicBlockToLoop(MinIdxVectBody, *LI);
+  VecLoop->addBasicBlockToLoop(MinIdxVectUpdate, *LI);
+  VecLoop->addBasicBlockToLoop(MinIdxVectContinue, *LI);
+
+  LI->addTopLevelLoop(VecLoop);
+
+  // Start populating preheader.
+  IRBuilder<> Builder(LoopPreheader->getTerminator());
+  // %VScale = tail call i64 @llvm.vscale.i64()
+  // %VLen = shl nuw nsw i64 %VScale, 2
+  // %minidx.not = sub nsw i64 0, %VLen
+  // %minidx.and = and i64 %ipos2, %minidx.not
+  Value *GMax = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+                                          ConstantFP::getInfinity(LoadType, 0),
+                                          "minloc.gmax");
+  Value *VScale = Builder.CreateVScale(I64Ty);
+  Value *VLen =
+      Builder.CreateShl(VScale, ConstantInt::get(I64Ty, 2), "minidx.vlen");
+  Value *Not =
+      Builder.CreateSub(ConstantInt::get(I64Ty, 0), VLen, "minidx.not");
+  // Value *Ipos2Minus1 = Builder.CreateSub(IncomingPos2,
+  // ConstantInt::get(I64Ty, 1), "minidx.ipos2.minus1");
+  Value *And = Builder.CreateAnd(SecondIndex, Not, "minidx.and");
+
+  // %minidx.umax = tail call i64 @llvm.umax.i64(i64 %minidx.and, i64 %ipos1)
+  // %minidx.add = add i64 %ipos2, 1
+  Value *Umax = Builder.CreateIntrinsic(
+      Intrinsic::smax, {I64Ty}, {And, FirstIndex}, nullptr, "minidx.umax");
+  Value *Add =
+      Builder.CreateAdd(SecondIndex, ConstantInt::get(I64Ty, 1), "minidx.add");
+  // %minidx.mask = call <vscale x 4 x i1>
+  // @llvm.get.active.lane.mask.nxv4i1.i64(i64 %minidx.umax, i64 %minidx.add)
+  Value *MinlocMask = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::get_active_lane_mask,
+                                        {MaskTy, I64Ty}),
+      {Umax, Add}, "minidx.mask");
+
+  // %minidx.add.ptr.i = getelementptr inbounds nuw float, ptr %p, i64
+  // %minidx.umax %minidx.masked.load = tail call <vscale x 4 x float>
+  // @llvm.masked.load.nxv4f32.p0(ptr %minidx.add.ptr.i, i32 1, <vscale x 4 x
+  // i1> %minidx.mask, <vscale x 4 x float> zeroinitializer) %minidx.currentVals
+  // = select <vscale x 4 x i1> %minidx.mask, <vscale x 4 x float>
+  // %minidx.masked.load, <vscale x 4 x float> splat (float 0x7FF0000000000000)
+  // %minidx.reverse = tail call <vscale x 4 x i1>
+  // @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %minidx.mask)
+  // %minidx.reverseVals = tail call <vscale x 4 x float>
+  // @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %minidx.currentVals)
+  // %minidx.minVal = call float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x
+  // 4 x float> %minidx.reverseVals)
+
+  Value *UmaxMinus1 =
+      Builder.CreateSub(Umax, ConstantInt::get(I64Ty, 1), "minidx.umax.minus1");
+  Value *AddPtrI = Builder.CreateInBoundsGEP(LoadType, BasePtr, UmaxMinus1,
+                                             "minidx.add.ptr.i");
+
+  Value *LoadVals =
+      Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::masked_load, {VecTy, PointerType}),
+                         {AddPtrI, ConstantInt::get(I32Ty, 1), MinlocMask,
+                          Constant::getNullValue(VecTy)},
+                         "minidx.loadVals");
+  Value *CurrentVals =
+      Builder.CreateSelect(MinlocMask, LoadVals, GMax, "minidx.currentVals");
+  Value *Reverse = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {MaskTy}),
+      {MinlocMask}, "minidx.reverse");
+  Value *ReverseVals = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {VecTy}),
+      {CurrentVals}, "minidx.reverseVals");
+  Value *MinVal =
+      Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::vector_reduce_fminimum, {VecTy}),
+                         {ReverseVals}, "minidx.minVal");
+
+  Builder.CreateCondBr(Builder.getTrue(), VecEntry, Header);
+  LoopPreheader->getTerminator()->eraseFromParent();
+
+  // Add edge from preheader to VecEntry
+  DTU.applyUpdates({{DominatorTree::Insert, LoopPreheader, VecEntry}});
+
+  // %minidx.entry.cmp = fcmp olt float %minidx.minVal, %init
+  // br i1 %minidx.entry.cmp, label %minidx.partial.1.if, label
+  // %minidx.partial.1.proc.exit
+  Builder.SetInsertPoint(VecEntry);
+  Value *VecEntryCmp = Builder.CreateFCmpOLT(
+      MinVal, ConstantFP::getInfinity(LoadType, 0), "minidx.entry.cmp");
+  Builder.CreateCondBr(VecEntryCmp, MinIdxPartial1If, MinIdxPartial1ProcExit);
+
+  // Connect edges from VecEntry to MinIdxPartial1If and MinIdxPartial1ProcExit
+  DTU.applyUpdates({{DominatorTree::Insert, VecEntry, MinIdxPartial1If},
+                    {DominatorTree::Insert, VecEntry, MinIdxPartial1ProcExit}});
+
+  Builder.SetInsertPoint(MinIdxPartial1If);
+  // %minVal.splatinsert = insertelement <vscale x 4 x float> poison, float
+  // %minidx.minVal, i64 0 %minVal.splat = shufflevector <vscale x 4 x float>
+  // %minVal.splatinsert, <vscale x 4 x float> poison, <vscale x 4 x i32>
+  // zeroinitializer
+  Value *MinValSplat = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+                                                 MinVal, "minval.splat");
+  // %minidx.partial.1.cmp = fcmp oeq <vscale x 4 x float> %minidx.reverseVals,
+  // %minVal.splat %minidx.partial.1.and = and <vscale x 4 x i1>
+  // %minidx.reverse, %minidx.partial.1.cmp %minidx.partial.1.cttz = tail call
+  // i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1>
+  // %minidx.partial.1.and, i1 true)
+  Value *FirstPartialCmp =
+      Builder.CreateFCmpOEQ(ReverseVals, MinValSplat, "minidx.partial.1.cmp");
+  Value *FirstPartialAnd =
+      Builder.CreateAnd(Reverse, FirstPartialCmp, "minidx.partial.1.and");
+  Value *FirstPartialCTTZ = Builder.CreateCountTrailingZeroElems(
+      I64Ty, FirstPartialAnd, ConstantInt::get(I1Ty, 1),
+      "minidx.partial.1.cttz");
+
+  // FIXME this pattern
+  // %minidx.partial.1.xor = xor i64 %minidx.partial.1.cttz, -1
+  // %minidx.partial.1.add1 = add i64 %minidx.umax, %VLen
+  // %minidx.partial.1.add2 = add i64 %minidx.partial.1.add1,
+  // %minidx.partial.1.xor br label %minidx.partial.1.proc.exit
+  Value *FirstPartialTmp1 =
+      Builder.CreateSub(VLen, FirstPartialCTTZ, "minidx.partial.1.tmp");
+  Value *FirstPartialTmp =
+      Builder.CreateSub(FirstPartialTmp1, ConstantInt::get(I64Ty, 1),
+                        "minidx.partial.1.tmp.minus1");
+  Value *FirstPartialAdd2 =
+      Builder.CreateAdd(Umax, FirstPartialTmp, "minidx.partial.1.add2");
+
+  Builder.CreateBr(MinIdxPartial1ProcExit);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxPartial1If, MinIdxPartial1ProcExit}});
+
+  Builder.SetInsertPoint(MinIdxPartial1ProcExit);
+  // %minidx.partial.1.exit.known_min = phi float [ %minidx.minVal,
+  // %minidx.partial.1.if ], [ %init, %entry ] %partial1.exit.known_arg = phi
+  // i64 [ %minidx.partial.1.add2, %minidx.partial.1.if ], [ 0, %entry ]
+  PHINode *Partial1ExitKnownMin =
+      Builder.CreatePHI(LoadType, 2, "minidx.partial.1.exit.known_min");
+  PHINode *Partial1ExitKnownArg =
+      Builder.CreatePHI(I64Ty, 2, "partial1.exit.known_arg");
+
+  Partial1ExitKnownMin->addIncoming(MinVal, MinIdxPartial1If);
+  Partial1ExitKnownMin->addIncoming(ConstantFP::getInfinity(LoadType, 0),
+                                    VecEntry);
+  Partial1ExitKnownArg->addIncoming(FirstPartialAdd2, MinIdxPartial1If);
+  Partial1ExitKnownArg->addIncoming(ConstantInt::get(I64Ty, 0), VecEntry);
+
+  // %minidx.partial.1.proc.exit.add = add i64 %VLen, %ipos1
+  // %minidx.partial.1.proc.exit.icmp = icmp ult i64 %minidx.umax,
+  // %minidx.partial.1.proc.exit.add br i1 %minidx.partial.1.proc.exit.icmp,
+  // label %minidx.vect.end, label %minidx.while.body.ph
+  Value *MinIdxPartial1ProcExitAdd =
+      Builder.CreateAdd(VLen, FirstIndex, "minidx.partial.1.proc.exit.add");
+  Value *MinIdxPartial1ProcExitICmp = Builder.CreateICmpULT(
+      Umax, MinIdxPartial1ProcExitAdd, "minidx.partial.1.proc.exit.icmp");
+  Builder.CreateCondBr(MinIdxPartial1ProcExitICmp, MinIdxVectEnd,
+                       MinIdxWhileBodyLrPh);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxPartial1ProcExit, MinIdxVectEnd},
+       {DominatorTree::Insert, MinIdxPartial1ProcExit, MinIdxWhileBodyLrPh}});
+
+  Builder.SetInsertPoint(MinIdxWhileBodyLrPh);
+  // %minidx.while.body.ph.mul = mul nsw i64 %VScale, -16
+  // %minidx.while.body.ph.gep = getelementptr i8, ptr %p, i64
+  // %minidx.while.body.ph.mul br label %minidx.vect.body
+  Builder.CreateBr(MinIdxVectBody);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxWhileBodyLrPh, MinIdxVectBody}});
+
+  Builder.SetInsertPoint(MinIdxVectBody);
+  // %minidx.vect.body.phi1 = phi i64 [ %minidx.umax, %minidx.while.body.ph ], [
+  // %minidx.vect.body.sub, %minidx.vect.continue ] %minidx.vect.body.known_arg
+  // = phi i64 [ %partial1.exit.known_arg, %minidx.while.body.ph ], [
+  // %minidx.vect.continue.known_arg, %minidx.vect.continue ]
+  // %minidx.vect.body.known_min = phi float [ %minidx.partial.1.exit.known_min,
+  // %minidx.while.body.ph ], [ %minidx.vect.continue.known_min,
+  // %minidx.vect.continue ]
+  PHINode *MinIdxVectBodyPhi1 =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.body.phi1");
+  PHINode *MinIdxVectBodyKnownArg =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.body.known_arg");
+  PHINode *MinIdxVectBodyKnownMin =
+      Builder.CreatePHI(LoadType, 2, "minidx.vect.body.known_min");
+
+  // %minidx.vect.body.sub = sub i64 %minidx.vect.body.phi1, %VLen
+  // %minidx.vect.body.shl = shl i64 %minidx.vect.body.phi1, 2
+  // %minid...
[truncated]

@llvmbot (Member) commented Jun 20, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Madhur Amilkanthwar (madhur13490)


; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW1-IL4
; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW4-IL1
; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW4-IL2
; RUN: opt -passes=loop-idiom-vectorize -S -mtriple=aarch64 -mattr=+sve %s | FileCheck %s --check-prefix=CHECK-LOOP-IDIOM
Contributor

In general, loop idiom vectorize tests live in a different directory. See test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll as an example.

Contributor

I think before landing the tests it's probably worth thinking about which pass/transform you actually want to use for your end goal. If your goal is to support general loop vectorisation then the tests should live in LoopVectorize and development should take place in LoopVectorizer/VPlan. The main intention of this pass is to support vectorisation of some common idioms that are unlikely to be supported in the loop vectoriser in the short-medium term. However, the ideal is still to migrate over to the loop vectoriser once that support exists.

Contributor Author

Thanks for the review. I have moved the test. As far as I understand from the discussion on Discourse, there is no plan to tackle this complicated pattern in VPlan. If the plan has changed since we last discussed it, I would like to know. CC @fhahn @Mel-Chen

@madhur13490 madhur13490 changed the title [WIP][LoopIdiomVectorize] Recognize and transform minidx pattern [LoopIdiomVectorize] Recognize and transform minidx pattern Jul 1, 2025

github-actions bot commented Jul 1, 2025

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff HEAD~1 HEAD --extensions cpp -- llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index bffebd085..7690e4b68 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -315,7 +315,8 @@ bool LoopIdiomVectorize::recognizeMinIdxPattern() {
   // 1. add(sext(sub(0 - SecondIndex)), sext(FirstIndex))
   // 2. add(sext(FirstIndex), sext(sub(0 - SecondIndex)))
   // This depends on whether canonicalization has been done or not.
-  // TODO: Handle the case where there is no sign extension and return type is i64.
+  // TODO: Handle the case where there is no sign extension and return type is
+  // i64.
   if (match(ICmpSLTFirstVal, m_Add(m_SExt(m_Sub(m_ZeroInt(), m_Value())),
                                    (m_SExt(m_Value()))))) {
     FirstIndex = dyn_cast<Instruction>(ICmpSLTFirstVal)->getOperand(1);
@@ -451,20 +452,20 @@ bool LoopIdiomVectorize::transformMinIdxPattern(
   // High-level overview of the transformation:
   // We divide the process in three phases:
   // In the first phase, we process a chunk which is not multiple of VF.
-  // We do this by rounding down the `SecondIndex` to the nearest multiple of VF.
-  // The minimum value and the index of the minimum value are computed for this chunk.
-  // In the second phase, we process all chunks which are multiple of VF.
-  // In the third phase, we process the last chunk which is not multiple of VF.
-  // The third phase is required because the FirstIndex is necessary to start from zero
-  // thus we take max(0, FirstIndex) as the starting index.
+  // We do this by rounding down the `SecondIndex` to the nearest multiple of
+  // VF. The minimum value and the index of the minimum value are computed for
+  // this chunk. In the second phase, we process all chunks which are multiple
+  // of VF. In the third phase, we process the last chunk which is not multiple
+  // of VF. The third phase is required because the FirstIndex is necessary to
+  // start from zero thus we take max(0, FirstIndex) as the starting index.
 
   // Overview of the algorithm to compute minindex within a chunk:
   // 1. We compare the current loaded vector against a splat of infinity.
-  // 2. Further, we set the bits until we find the first set bit in the output of the
-  // above comparison. This is realized using the `cttz` intrinsic.
-  // 3. Next, we count the number of bits set and this gives us the offset from the
-  // base. The base of the chunk is updated in each phase.
-  // Step 1 and 2 are done using brkb + cnt which is realized using the `cttz` intrinsic.
+  // 2. Further, we set the bits until we find the first set bit in the output
+  // of the above comparison. This is realized using the `cttz` intrinsic.
+  // 3. Next, we count the number of bits set and this gives us the offset from
+  // the base. The base of the chunk is updated in each phase. Step 1 and 2 are
+  // done using brkb + cnt which is realized using the `cttz` intrinsic.
 
   // The below basic blocks are used to process the first phase
   // and are for processing the chunk which is not multiple of VF.
@@ -475,7 +476,7 @@ bool LoopIdiomVectorize::transformMinIdxPattern(
       BasicBlock::Create(Ctx, "minidx.partial.1.if", F);
   BasicBlock *MinIdxPartial1ProcExit =
       BasicBlock::Create(Ctx, "minidx.partial.1.proc.exit", F);
-  
+
   // The below basic blocks are used to process the second phase
   // and are for processing the chunks which are multiple of VF.
   BasicBlock *MinIdxWhileBodyLrPh =
@@ -551,8 +552,8 @@ bool LoopIdiomVectorize::transformMinIdxPattern(
       Builder.CreateSub(ConstantInt::get(I64Ty, 0), VLen, "minidx.not");
   Value *And = Builder.CreateAnd(SecondIndex, Not, "minidx.and");
 
-  // %minidx.umax = tail call i64 @llvm.umax.i64(i64 %minidx.and, i64 %FirstIndex)
-  // %minidx.add = add i64 %SecondIndex, 1
+  // %minidx.umax = tail call i64 @llvm.umax.i64(i64 %minidx.and, i64
+  // %FirstIndex) %minidx.add = add i64 %SecondIndex, 1
   Value *Umax = Builder.CreateIntrinsic(
       Intrinsic::smax, {I64Ty}, {And, FirstIndex}, nullptr, "minidx.umax");
   Value *Add =

@madhur13490 madhur13490 requested review from fhahn and Mel-Chen July 1, 2025 14:16
@madhur13490 (Contributor Author)

Thanks to @rj-jesus for helping me with important pieces of this implementation!

@sjoerdmeijer (Collaborator) commented Jul 10, 2025

This might belong in the vectoriser, but I followed the link to the Discourse discussion and learned that this has been an ongoing activity since 2023. I therefore think a pattern-match approach is not ideal but is reasonable as a stopgap, and for what it is worth there is a lot of precedent for this in this file. This pattern match is quite a bit of code, but it's fairly straightforward and self-contained. I haven't looked at this in detail, but how about progressing this, @david-arm?

@nikic (Contributor) commented Jul 10, 2025

@sjoerdmeijer See also some related discussion on #141556. My takeaway from that was that this really needs a GVN improvement first.

@madhur13490 (Contributor Author)

@sjoerdmeijer See also some related discussion on #141556. My takeaway from that was that this really needs a GVN improvement first.

Hi @nikic, as explained on the Discourse post, LLVM's GVN is not as sophisticated as GCC's. LLVM's GVN is based on an older algorithm from the literature. Ideally, GVN should use both anticipated and available expressions, but it currently does not. Please see Thomas' thesis (https://www.cs.purdue.edu/homes/hosking/papers/vandrunen.pdf), whose approach GCC implements.
Having said this, a GVN improvement would give a 10-15% improvement, whereas vectorization via this patch gives a 4x improvement. I see a clear winner, and thus I'd like to see if we can progress this.
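
To make the PRE point concrete, here is a rough C++ sketch of the minlst loop after load PRE (an illustration only, not code from this patch or from GCC): keeping the current minimum value in a register removes the loop-carried reload of xxtrt(minlst), leaving a min-with-index style reduction.

// Illustration: the minlst loop after load PRE. The value at the current
// minimum index lives in a register instead of being reloaded each iteration.
int minlstAfterPRE(const float *xxtrt, int ipos1, int ipos2) {
  int minIdx = ipos2;
  float minVal = xxtrt[minIdx - 1]; // loaded once before the loop
  for (int ipos = ipos2 - 1; ipos >= ipos1; --ipos) {
    float v = xxtrt[ipos - 1];      // the only load left in the loop body
    if (v < minVal) {
      minVal = v;
      minIdx = ipos;
    }
  }
  return minIdx;
}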

@sjoerdmeijer (Collaborator)

@sjoerdmeijer See also some related discussion on #141556. My takeaway from that was that this really needs a GVN improvement first.

There's quite some discussion going on there, but I think you're referring to this comment:

but it feels like the focus we should be trying to optimise the scalar IR before we attempt vectorisation. There are no stores in the loop and the bounds seem to be well-known.

My understanding is that there's two ways to optimise this:

  • GVN, which is the scalar optimisation we are talking about. GCC is doing this PRE, and that's how this came up, as a case where we are behind.
  • But GVN and NewGVN are a difficult story, and then we found that vectorisation is actually the much better approach. The vectorisation performance gain is much bigger than the gain we get from PRE and GVN.

Isn't that right, or am I missing something, @madhur13490 ?

@madhur13490 (Contributor Author)

@sjoerdmeijer See also some related discussion on #141556. My takeaway from that was that this really needs a GVN improvement first.

There's quite some discussion going on there, but I think you're referring to this comment:

but it feels like the focus we should be trying to optimise the scalar IR before we attempt vectorisation. There are no stores in the loop and the bounds seem to be well-known.

My understanding is that there's two ways to optimise this:

  • GVN, which is the scalar optimisation we are talking about it. GCC is doing this PRE, and that's how this came up, as a case where we are behind.
  • But GVN and NewGVN is a difficult story, and then we found that vectorisation is actually the much better approach. The vectorisation performance gain is much bigger than the gain we get from PRE and GVN.

Isn't that right, or am I missing something, @madhur13490 ?

That's right. NewGVN is not in good shape and, as @nikic said here, we may want to delete it in the future. So I wouldn't put any eggs in that basket.

Contributor

@fhahn fhahn left a comment

@sjoerdmeijer See also some related discussion on #141556. My takeaway from that was that this really needs a GVN improvement first.

There's quite some discussion going on there, but I think you're referring to this comment:

but it feels like the focus we should be trying to optimise the scalar IR before we attempt vectorisation. There are no stores in the loop and the bounds seem to be well-known.

My understanding is that there's two ways to optimise this:

* GVN, which is the scalar optimisation we are talking about it. GCC is doing this PRE, and that's how this came up, as a case where we are behind.

* But GVN and NewGVN is a difficult story, and then we found that vectorisation is actually the much better approach. The vectorisation performance gain is much bigger than the gain we get from PRE and GVN.

W/o PRE we won't be able to handle this in LV. But we would at least need to implicitly perform PRE here in LoopIdiom, so it should also be possible to implement this form of PRE separately.

I've not looked at the patch in detail, but on a high level,

  • it looks like there are a number of legality checks missing, e.g. it doesn't check if there are any potential memory writes in the loop, and possibly misses checks if PRE is valid
  • hard codes scalable vector codegen
  • doesn't check costs of generated vector code vs original scalar loop
  • hard codes check for branch condition (just supporting ICMP_SGT)

I am not sure, but I suspect addressing all the above will add significant complexity & duplication compared to LV.
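
As a rough illustration of the first point, a minimal legality guard (a sketch only, assuming it would run near the top of recognizeMinIdxPattern(); it is not part of this patch) could look like:

// Sketch: bail out if any instruction in the loop may write to memory or
// throw, since the transform re-reads the scanned array with masked loads.
for (BasicBlock *BB : CurLoop->blocks())
  for (Instruction &I : *BB)
    if (I.mayWriteToMemory() || I.mayThrow()) {
      LLVM_DEBUG(dbgs() << "Loop may write to memory or throw; "
                           "not a minidx candidate\n");
      return false;
    }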

@david-arm (Contributor)

My understanding is that there's two ways to optimise this:

  • GVN, which is the scalar optimisation we are talking about it. GCC is doing this PRE, and that's how this came up, as a case where we are behind.
  • But GVN and NewGVN is a difficult story, and then we found that vectorisation is actually the much better approach. The vectorisation performance gain is much bigger than the gain we get from PRE and GVN.

Isn't that right, or am I missing something, @madhur13490 ?

And the problem is that as soon as someone validly decides to teach GVN to perform load PRE in this case, this code in LoopIdiomVectorize will become dead and you'll probably see a large performance regression, because the improved scalar code will no longer match the idiom recognised here. I do understand that improving GVN could take a long while, so I do see an argument for this PR, but beware of the risk. We can't block good improvements in earlier scalar passes based on the fact that we would no longer recognise the idiom here.

@sjoerdmeijer (Collaborator)

We can't block good improvements in earlier scalar passes based on the fact we no longer recognise the idiom here.

Agreed, of course. And this pattern matching needs to disappear anyway, long term.

I guess the main thing left to assess and investigate, to see whether this short-term solution can be salvaged, is what @fhahn wrote about legality:

it looks like there are a number of legality checks missing, e.g. it doesn't check whether there are any potential memory writes in the loop, and possibly misses checks that PRE is valid

The other comments look minor.

@nikic
Contributor

nikic commented Jul 10, 2025

@sjoerdmeijer See also some related discussion on #141556. My takeaway from that was that this really needs a GVN improvement first.

Hi @nikic, as explained on the discourse post, LLVM's GVN is not as sophisticated as GCC's. LLVM's GVN is based on an older algorithm from the literature. Ideally, GVN should use both anticipated and available expressions, but it currently does not. Please see Thomas VanDrunen's thesis (https://www.cs.purdue.edu/homes/hosking/papers/vandrunen.pdf), which GCC implements. Having said this, a GVN improvement would lead to a 10-15% improvement, while vectorization via this patch leads to a 4x improvement. I see a clear winner, and thus I'd like to see if we can progress this.

It's my understanding that the GVN improvement brings this into a form that we can at least in principle vectorize, even if it doesn't work yet. It's not either/or, we want both.

@madhur13490
Contributor Author

madhur13490 commented Jul 10, 2025

@sjoerdmeijer See also some related discussion on #141556. My takeaway from that was that this really needs a GVN improvement first.

Hi @nikic, as explained on the discourse post, LLVM's GVN is not as sophisticated as GCC's. LLVM's GVN is based on an older algorithm from the literature. Ideally, GVN should use both anticipated and available expressions, but it currently does not. Please see Thomas VanDrunen's thesis (https://www.cs.purdue.edu/homes/hosking/papers/vandrunen.pdf), which GCC implements. Having said this, a GVN improvement would lead to a 10-15% improvement, while vectorization via this patch leads to a 4x improvement. I see a clear winner, and thus I'd like to see if we can progress this.

It's my understanding that the GVN improvement brings this into a form that we can at least in principle vectorize, even if it doesn't work yet. It's not either/or, we want both.

I am not sure if that is the right understanding. @fhahn, do you think IR optimized by GVN (with a lot of work) would or could be vectorizable?

@madhur13490 madhur13490 marked this pull request as ready for review July 10, 2025 13:54
This patch vectorizes the case where the array scan happens backwards
and the first minidx is returned. The motivating example is found in
the rnflow FORTRAN benchmark.

Pre-commit test can be found as part of llvm#141556
Contributor

@fhahn fhahn left a comment


It's my understanding that the GVN improvement brings this into a form that we can at least in principle vectorize, even if it doesn't work yet. It's not either/or, we want both.

I am not sure if that is the right understanding. @fhahn, do you think IR optimized by GVN (with a lot of work) would or could be vectorizable?

Yes, once the loads have been simplified by GVN, we should be able to tackle this in LV.

@madhur13490
Contributor Author

madhur13490 commented Jul 16, 2025

It's my understanding that the GVN improvement brings this into a form that we can at least in principle vectorize, even if it doesn't work yet. It's not either/or, we want both.

I am not sure if that is the right understanding. @fhahn, do you think IR optimized by GVN (with a lot of work) would or could be vectorizable?

Yes, once the loads have been simplified by GVN, we should be able to tackle this in LV.

Thanks.

@fhahn @nikic
I am thinking aloud. If not GVN, then is there a place in LLVM where I can pattern match and "simplify" the loads?

 %load1_ptr = getelementptr float, ptr %first_ptr, i64 %iv
 %load1 = load float, ptr %load1_ptr, align 4          ; <--- load the current element
 %index_sext = sext i32 %index to i64
 %load2_ptr = getelementptr float, ptr %second_ptr, i64 %index_sext  ; <--- load the last min
 %load2 = load float, ptr %load2_ptr, align 4
 %cmp = fcmp contract olt float %load1, %load2          ; <--- is the current element smaller?

GCC loads the second value into a register and updates it when a new minimum element is found. This is PRE to me, but doing this in GVN is not feasible. Is there any other pass where I can achieve this?

FWIW, here is an equivalent C loop: https://godbolt.org/z/EcT6efd3x
The GCC-generated code has a single load in the loop. Register w4 is a temporary register used to hold the last MinIdx.
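
Since the Godbolt link isn't reproduced here, a hand-written equivalent of that loop looks roughly like this (an assumed sketch, names made up); note the two loads per iteration, x[i] and x[min], matching the IR snippet above:

// Backward scan that returns the index of the minimum element.
// Each iteration loads both the current element and the current minimum.
int minIdxBackward(const float *x, int lo, int hi) {
  int min = hi;
  for (int i = hi - 1; i >= lo; --i)
    if (x[i] < x[min])   // two loads: x[i] and x[min]
      min = i;
  return min;
}

GCC's PRE keeps x[min] in a register across iterations, which is how it ends up with a single load in the loop body.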

@nikic
Contributor

nikic commented Jul 16, 2025

@madhur13490 I think this needs to happen in two steps. The first thing you want is to get rid of the sext + trunc in the Minlst IV, as these complicate the pattern. Something along these lines: https://llvm.godbolt.org/z/6co5q3Wrn I'm not really sure how best to formulate this, but in principle something like this could fit in InstCombine.

I think it would be better to focus on this simplified case for now, so we don't get lost in uninteresting sext/trunc details: https://llvm.godbolt.org/z/Gc9n3qx47

In that form, I think the pattern is a lot more straightforward, and I feel like it should be possible to extend GVN load PRE to handle it. GVN / MDA already support select dependences for the general load-of-select pattern. I think what is missing here is handling this case of a phi-translated select.
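
For concreteness, the load-of-select case that GVN/MDA already handle looks, in source terms, something like this (illustrative sketch, not taken from the links above):

// Load through a select of pointers where both arm values are available:
// GVN can replace the final load with `c ? va : vb`.
float loadOfSelect(bool c, float *a, float *b) {
  float va = *a;
  float vb = *b;
  float *p = c ? a : b;
  return *p;
}

In the minidx loop, by contrast, the select is over indices and only feeds the load's address on the next iteration through the loop PHI, which is the phi-translation step that is missing.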

@madhur13490
Contributor Author

In that form, I think the pattern is a lot more straightforward, and I feel like it should be possible to extend GVN load PRE to handle it. GVN / MDA already support select dependences for the general load-of-select pattern. I think what is missing here is handling this case of a phi-translated select.

I was hoping for a non-GVN solution here :)

The select case currently handled by GVN is for pointer selects; two loaded values are selected and one of them is made available. In this case, it is the index that is selected. Hypothetically, if GVN optimizes this, then the transformed code would look like
https://llvm.godbolt.org/z/YbscY3jo6. @tgt is the function GVN should produce.

We need to

  1. Hoist the load.
  2. Insert a new PHI for the minimum value.
  3. Insert a new select for selecting the min value (see the sketch below).
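
In source terms, the result of those three steps would look roughly like this (a sketch of the shape of @tgt, not the exact IR behind the link; names are made up):

// After PRE: the load of x[min] is hoisted, the running minimum value is
// carried across iterations, and selects update both the index and the value.
int minIdxAfterPRE(const float *x, int lo, int hi) {
  int min = hi;
  float minVal = x[min];             // 1. hoisted load
  for (int i = hi - 1; i >= lo; --i) {
    float cur = x[i];                // the only load left in the loop
    bool smaller = cur < minVal;
    min = smaller ? i : min;         // 3. select for the index
    minVal = smaller ? cur : minVal; // 2. value carried by a new PHI
  }
  return min;
}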

Taking a pause and reiterating: while this requires work in GVN and LV by @fhahn, I still think this patch should proceed, as it is available today and ready to use. We are not sure about the unknowns in GVN and LV, or how they will pan out.

Please note, this patch offers a 4x speedup for the pattern.

@david-arm
Contributor

david-arm commented Jul 16, 2025

Please note, this patch offers a 4x speedup for the pattern.

I do believe this to be true, but equally I think you also have to accept the risk of a 4x slowdown once this is finally supported in GVN, as the patterns will no longer match. If this PR does go ahead, then it might be a good idea to first add similar tests in GVN. That way you have a chance of getting an early warning that work in GVN will put this idiom at risk.
