From 7977537b05e7812277749293a25f27fbe6b67a40 Mon Sep 17 00:00:00 2001 From: Anna Thomas Date: Thu, 15 Feb 2024 16:02:53 -0500 Subject: [PATCH 1/2] Precommit test for issue in pr81872 --- .../Transforms/LoopVectorize/X86/pr81872.ll | 137 ++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr81872.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll new file mode 100644 index 0000000000000..1b21b7dbae686 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@global = external global ptr addrspace(1), align 8 + +; PR 81872 explains the issue. + +; FIXME: Miscompile where array IV and thereby value stored in (arr[99], +; arr[98]) is calculated incorrectly since disjoint or was only disjoint because +; of dominating conditions. Dropping the disjoint to avoid poison still changes +; the behaviour since now the or is not longer equivalent to the add. +; Function Attrs: uwtable +define void @test(ptr addrspace(1) noundef align 8 dereferenceable_or_null(16) %arg1) #0 { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr addrspace(1) noundef align 8 dereferenceable_or_null(16) [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb5: +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 16 +; CHECK-NEXT: br label [[BB8:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[PHI9:%.*]], 1 +; CHECK-NEXT: [[ICMP7:%.*]] = icmp sgt i32 [[PHI9]], -2 +; CHECK-NEXT: br i1 [[ICMP7]], label [[BB10:%.*]], label [[BB8]] +; CHECK: bb8: +; CHECK-NEXT: [[PHI9]] = phi i32 [ -60516, [[BB5:%.*]] ], [ [[ADD]], [[BB6:%.*]] ] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 99, [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr addrspace(1) [[GETELEMENTPTR]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr addrspace(1) [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr addrspace(1) [[TMP7]], i32 -3 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[TMP8]], i32 8, <4 x i1> [[REVERSE]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[BB6]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 87, [[MIDDLE_BLOCK]] ], [ 99, [[BB8]] ] +; CHECK-NEXT: br label [[BB15:%.*]] +; CHECK: bb10: +; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 96 +; CHECK-NEXT: [[LOAD12:%.*]] = load i64, ptr addrspace(1) [[GETELEMENTPTR11]], align 8, !noundef [[META5:![0-9]+]] +; CHECK-NEXT: [[LOAD13:%.*]] = load ptr addrspace(1), ptr @global, align 8, !invariant.load [[META5]], !nonnull [[META5]], !dereferenceable_or_null !6, !align [[META7:![0-9]+]] +; CHECK-NEXT: [[GETELEMENTPTR14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[LOAD13]], i64 848 +; CHECK-NEXT: store i64 [[LOAD12]], ptr addrspace(1) [[GETELEMENTPTR14]], align 8 +; CHECK-NEXT: ret void +; CHECK: bb15: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[BB20:%.*]] ] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 1 +; CHECK-NEXT: [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0 +; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[BB20]], !prof [[PROF8:![0-9]+]] +; CHECK: bb18: +; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-NEXT: [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[GETELEMENTPTR]], i64 [[OR]] +; CHECK-NEXT: store i64 1, ptr addrspace(1) [[GETELEMENTPTR19]], align 8 +; CHECK-NEXT: br label [[BB20]] +; CHECK: bb20: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90 +; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[BB15]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] +; +bb5: + %getelementptr = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 16 + br label %bb8 + +bb6: ; preds = %bb20 + %add = add nsw i32 %phi9, 1 + %icmp7 = icmp sgt i32 %phi9, -2 + br i1 %icmp7, label %bb10, label %bb8 + +bb8: ; preds = %bb6, %bb5 + %phi9 = phi i32 [ -60516, %bb5 ], [ %add, %bb6 ] + br label %bb15 + +bb10: ; preds = %bb6 + %getelementptr11 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 96 + %load12 = load i64, ptr addrspace(1) %getelementptr11, align 8, !noundef !4 + %load13 = load ptr addrspace(1), ptr @global, align 8, !invariant.load !4, !nonnull !4, !dereferenceable_or_null !16, !align !17 + %getelementptr14 = getelementptr inbounds i8, ptr addrspace(1) %load13, i64 848 + store i64 %load12, ptr addrspace(1) %getelementptr14, align 8 + ret void + +bb15: ; preds = %bb20, %bb8 + %iv = phi i64 [ 99, %bb8 ], [ %iv.next, %bb20 ] + %and = and i64 %iv, 1 + %icmp17 = icmp eq i64 %and, 0 + br i1 %icmp17, label %bb18, label %bb20, !prof !21 + +bb18: ; preds = %bb15 + %or = or disjoint i64 %iv, 1 + %getelementptr19 = getelementptr inbounds i64, ptr addrspace(1) %getelementptr, i64 %or + store i64 1, ptr addrspace(1) %getelementptr19, align 8 + br label %bb20 + +bb20: ; preds = %bb18, %bb15 + %iv.next = add nsw i64 %iv, -1 + %icmp22 = icmp eq i64 %iv.next, 90 + br i1 %icmp22, label %bb6, label %bb15, !prof !22 +} + +attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" } + +!4 = !{} +!10 = !{i32 1} +!16 = !{i64 864} +!17 = !{i64 8} +!21 = !{!"branch_weights", i32 1, i32 1} +!22 = !{!"branch_weights", i32 1, i32 95} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 127} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 23} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META5]] = !{} +; CHECK: [[META7]] = !{i64 8} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 1, i32 1} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]} +;. From e446dbe9d84b82441706197783dc05c9eacf488d Mon Sep 17 00:00:00 2001 From: Anna Thomas Date: Thu, 15 Feb 2024 16:10:51 -0500 Subject: [PATCH 2/2] [LoopVectorize] Fix for miscompile with disjoint or When disjoint or is disjoint because of dominating conditions, we should not vectorize such loops. This is because vectorizing it causes us to no longer have the original meaning of the instruction (it is no longer disjoint, it becomes a regular or which is not an add). Fixes pr81872. --- .../Vectorize/LoopVectorizationLegality.cpp | 11 ++++ .../Transforms/LoopVectorize/X86/pr81872.ll | 59 ++++--------------- 2 files changed, 23 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 37a356c43e29a..f2744b5d66a26 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -888,6 +888,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return false; } // end of PHI handling + // or-disjoint instruction which is predicated maybe disjoint because of a + // dominating condition. Avoid vectorizing in such cases, since dropping + // the disjoint (to avoid poison) changes the meaning of the instruction + // which was originally an add. + // TODO: We should check the terminating condition of all + // dominating blocks in the loop to see if Op0 is used for their + // terminating condition. + if (auto *Op = dyn_cast(&I)) + if (Op->isDisjoint() && blockNeedsPredication(BB)) + return false; + // We handle calls that: // * Are debug info intrinsics. // * Have a mapping to an IR intrinsic. diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll index 1b21b7dbae686..20a6732e340b7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll @@ -5,12 +5,12 @@ target triple = "x86_64-unknown-linux-gnu" @global = external global ptr addrspace(1), align 8 -; PR 81872 explains the issue. +; PR 81872 explains the issue. -; FIXME: Miscompile where array IV and thereby value stored in (arr[99], +; If we vectorize, we have a miscompile where array IV and thereby value stored in (arr[99], ; arr[98]) is calculated incorrectly since disjoint or was only disjoint because ; of dominating conditions. Dropping the disjoint to avoid poison still changes -; the behaviour since now the or is not longer equivalent to the add. +; the behaviour since now the or is not longer equivalent to the add. ; Function Attrs: uwtable define void @test(ptr addrspace(1) noundef align 8 dereferenceable_or_null(16) %arg1) #0 { ; CHECK-LABEL: define void @test( @@ -24,48 +24,19 @@ define void @test(ptr addrspace(1) noundef align 8 dereferenceable_or_null(16) % ; CHECK-NEXT: br i1 [[ICMP7]], label [[BB10:%.*]], label [[BB8]] ; CHECK: bb8: ; CHECK-NEXT: [[PHI9]] = phi i32 [ -60516, [[BB5:%.*]] ], [ [[ADD]], [[BB6:%.*]] ] -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 99, [[INDEX]] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr addrspace(1) [[GETELEMENTPTR]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr addrspace(1) [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr addrspace(1) [[TMP7]], i32 -3 -; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> , ptr addrspace(1) [[TMP8]], i32 8, <4 x i1> [[REVERSE]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[BB6]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 87, [[MIDDLE_BLOCK]] ], [ 99, [[BB8]] ] ; CHECK-NEXT: br label [[BB15:%.*]] ; CHECK: bb10: ; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 96 -; CHECK-NEXT: [[LOAD12:%.*]] = load i64, ptr addrspace(1) [[GETELEMENTPTR11]], align 8, !noundef [[META5:![0-9]+]] -; CHECK-NEXT: [[LOAD13:%.*]] = load ptr addrspace(1), ptr @global, align 8, !invariant.load [[META5]], !nonnull [[META5]], !dereferenceable_or_null !6, !align [[META7:![0-9]+]] +; CHECK-NEXT: [[LOAD12:%.*]] = load i64, ptr addrspace(1) [[GETELEMENTPTR11]], align 8, !noundef [[META0:![0-9]+]] +; CHECK-NEXT: [[LOAD13:%.*]] = load ptr addrspace(1), ptr @global, align 8, !invariant.load [[META0]], !nonnull [[META0]], !dereferenceable_or_null !1, !align [[META2:![0-9]+]] ; CHECK-NEXT: [[GETELEMENTPTR14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[LOAD13]], i64 848 ; CHECK-NEXT: store i64 [[LOAD12]], ptr addrspace(1) [[GETELEMENTPTR14]], align 8 ; CHECK-NEXT: ret void ; CHECK: bb15: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[BB20:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 99, [[BB8]] ], [ [[IV_NEXT:%.*]], [[BB20:%.*]] ] ; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 1 ; CHECK-NEXT: [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0 -; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[BB20]], !prof [[PROF8:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[BB20]], !prof [[PROF3:![0-9]+]] ; CHECK: bb18: ; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 ; CHECK-NEXT: [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[GETELEMENTPTR]], i64 [[OR]] @@ -74,7 +45,7 @@ define void @test(ptr addrspace(1) noundef align 8 dereferenceable_or_null(16) % ; CHECK: bb20: ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90 -; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[BB15]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[BB15]], !prof [[PROF4:![0-9]+]] ; bb5: %getelementptr = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 16 @@ -124,14 +95,8 @@ attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" } !21 = !{!"branch_weights", i32 1, i32 1} !22 = !{!"branch_weights", i32 1, i32 95} ;. -; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 127} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 23} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} -; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[META5]] = !{} -; CHECK: [[META7]] = !{i64 8} -; CHECK: [[PROF8]] = !{!"branch_weights", i32 1, i32 1} -; CHECK: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]} +; CHECK: [[META0]] = !{} +; CHECK: [[META2]] = !{i64 8} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1} +; CHECK: [[PROF4]] = !{!"branch_weights", i32 1, i32 95} ;.