From 51f371377ef0a257b36bd54fdfe340d9ad33968e Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 13 Jun 2024 14:11:13 +0000 Subject: [PATCH 1/4] [mlir][LLVMIR][OpenMP] fix dominance for reduction init block It was incorrect to set the insertion point to the init block after inlining the initialization region because the code generated in the init block depends upon the value yielded from the init region. When there were multiple reduction initialization regions each with multiple blocks, this could lead to the initialization region being inlined after the init block which depends upon it. Moving the insertion point to before inlining the initialization block turned up further issues around the handling of the terminator for the initialization block, which are also fixed here. This fixes a bug in #92430 (but the affected code couldn't compile before #92430 anyway). --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 21 +- .../openmp-parallel-reduction-multiblock.mlir | 342 ++++++++++++++++++ .../LLVMIR/openmp-reduction-init-arg.mlir | 2 +- 3 files changed, 361 insertions(+), 4 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index cbfc64972f38b..9fe63a9655be2 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -388,8 +388,18 @@ static LogicalResult inlineConvertOmpRegions( // be processed multiple times. 
moduleTranslation.forgetMapping(region); - if (potentialTerminator && potentialTerminator->isTerminator()) - potentialTerminator->insertAfter(&builder.GetInsertBlock()->back()); + if (potentialTerminator && potentialTerminator->isTerminator()) { + llvm::BasicBlock *block = builder.GetInsertBlock(); + if (block->empty()) + // this can happen for really simple reduction init regions e.g. + // %0 = llvm.mlir.constant(0 : i32) : i32 + // omp.yield(%0 : i32) + // because the llvm.mlir.constant (MLIR op) isn't converted into any + // llvm op + potentialTerminator->insertInto(block, block->begin()); + else + potentialTerminator->insertAfter(&block->back()); + } return success(); } @@ -1171,6 +1181,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, } } + builder.SetInsertPoint(initBlock->getFirstNonPHIOrDbgOrAlloca()); + for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) { SmallVector phis; @@ -1183,7 +1195,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, assert(phis.size() == 1 && "expected one value to be yielded from the " "reduction neutral element declaration region"); - builder.SetInsertPoint(initBlock->getTerminator()); + + // mapInitializationArg finishes its block with a terminator. We need to + // insert before that terminator. 
+ builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); if (isByRef[i]) { // Store the result of the inlined region to the allocated reduction var diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir new file mode 100644 index 0000000000000..00020bd4c9d1e --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir @@ -0,0 +1,342 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// generated by flang-new: +// subroutine missordered_blocks(x,y) +// integer, allocatable :: x, y +// !$omp parallel reduction(+:x,y) +// x = 42 +// y = 24 +// !$omp end parallel +// end subroutine + +// This is basically a test that we don't crash while translating this IR + +omp.declare_reduction @add_reduction_byref_box_heap_i32 : !llvm.ptr init { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i32) : i32 + %5 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %6 = llvm.mlir.constant(0 : i64) : i64 + %7 = llvm.mlir.constant(0 : i32) : i32 + %8 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %8, %5 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %9 = llvm.mlir.constant(1 : i64) : i64 + %10 = llvm.alloca %9 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr + %11 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %12 = llvm.load %11 : !llvm.ptr -> !llvm.ptr + %13 = llvm.ptrtoint %12 : !llvm.ptr to i64 + %14 = llvm.icmp "eq" %13, %6 : i64 + 
llvm.cond_br %14, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + %15 = llvm.mlir.constant(9 : i32) : i32 + %16 = llvm.mlir.zero : !llvm.ptr + %17 = llvm.getelementptr %16[1] : (!llvm.ptr) -> !llvm.ptr, i32 + %18 = llvm.ptrtoint %17 : !llvm.ptr to i64 + %19 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %20 = llvm.insertvalue %18, %19[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %21 = llvm.mlir.constant(20180515 : i32) : i32 + %22 = llvm.insertvalue %21, %20[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %23 = llvm.mlir.constant(0 : i32) : i32 + %24 = llvm.trunc %23 : i32 to i8 + %25 = llvm.insertvalue %24, %22[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %26 = llvm.trunc %15 : i32 to i8 + %27 = llvm.insertvalue %26, %25[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %28 = llvm.mlir.constant(2 : i32) : i32 + %29 = llvm.trunc %28 : i32 to i8 + %30 = llvm.insertvalue %29, %27[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %31 = llvm.mlir.constant(0 : i32) : i32 + %32 = llvm.trunc %31 : i32 to i8 + %33 = llvm.insertvalue %32, %30[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %34 = llvm.insertvalue %12, %33[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %34, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %35 = llvm.load %3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %35, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + llvm.br ^bb3 +^bb2: // pred: ^bb0 + %36 = llvm.mlir.zero : !llvm.ptr + %37 = llvm.getelementptr %36[1] : (!llvm.ptr) -> !llvm.ptr, i32 + %38 = llvm.ptrtoint %37 : !llvm.ptr to i64 + //%39 = llvm.call @malloc(%38) {in_type = i32, operandSegmentSizes = array} : (i64) -> !llvm.ptr + %39 = llvm.mlir.zero : !llvm.ptr + llvm.store %7, %39 : i32, !llvm.ptr + %40 = llvm.mlir.constant(9 : i32) : i32 + %41 = llvm.mlir.zero : !llvm.ptr + %42 = llvm.getelementptr %41[1] : (!llvm.ptr) -> !llvm.ptr, i32 + %43 = llvm.ptrtoint %42 : !llvm.ptr 
to i64 + %44 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %45 = llvm.insertvalue %43, %44[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %46 = llvm.mlir.constant(20180515 : i32) : i32 + %47 = llvm.insertvalue %46, %45[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %48 = llvm.mlir.constant(0 : i32) : i32 + %49 = llvm.trunc %48 : i32 to i8 + %50 = llvm.insertvalue %49, %47[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %51 = llvm.trunc %40 : i32 to i8 + %52 = llvm.insertvalue %51, %50[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %53 = llvm.mlir.constant(2 : i32) : i32 + %54 = llvm.trunc %53 : i32 to i8 + %55 = llvm.insertvalue %54, %52[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %56 = llvm.mlir.constant(0 : i32) : i32 + %57 = llvm.trunc %56 : i32 to i8 + %58 = llvm.insertvalue %57, %55[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %59 = llvm.insertvalue %39, %58[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %59, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %60 = llvm.load %1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %60, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + llvm.br ^bb3 +^bb3: // 2 preds: ^bb1, ^bb2 + omp.yield(%10 : !llvm.ptr) +} combiner { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %4 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %4, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %5, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + 
%6 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr + %8 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr + %10 = llvm.load %7 : !llvm.ptr -> i32 + %11 = llvm.load %9 : !llvm.ptr -> i32 + %12 = llvm.add %10, %11 : i32 + llvm.store %12, %7 : i32, !llvm.ptr + omp.yield(%arg0 : !llvm.ptr) +} cleanup { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.mlir.constant(0 : i64) : i64 + %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %3, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %5 = llvm.load %4 : !llvm.ptr -> !llvm.ptr + %6 = llvm.ptrtoint %5 : !llvm.ptr to i64 + %7 = llvm.icmp "ne" %6, %2 : i64 + llvm.cond_br %7, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + //llvm.call @free(%5) : (!llvm.ptr) -> () + llvm.br ^bb2 +^bb2: // 2 preds: ^bb0, ^bb1 + omp.yield +} +llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: !llvm.ptr {fir.bindc_name = "y"}) attributes {fir.internal_name = "_QPmissordered_blocks", frame_pointer = #llvm.framePointerKind<"non-leaf">, target_cpu = "generic", target_features = #llvm.target_features<["+outline-atomics", "+v8a", "+fp-armv8", "+neon"]>} { + %0 = llvm.mlir.constant(24 : i32) : i32 + %1 = llvm.mlir.constant(42 : i32) : i32 + omp.parallel reduction(byref @add_reduction_byref_box_heap_i32 %arg0 -> %arg2 : !llvm.ptr, byref @add_reduction_byref_box_heap_i32 %arg1 -> %arg3 : !llvm.ptr) { + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) 
-> !llvm.ptr + %4 = llvm.mlir.constant(1 : i32) : i32 + %5 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %6 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %6, %5 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %7 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %8 = llvm.load %7 : !llvm.ptr -> !llvm.ptr + llvm.store %1, %8 : i32, !llvm.ptr + %9 = llvm.load %arg3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %9, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %10 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr + llvm.store %0, %11 : i32, !llvm.ptr + omp.terminator + } + llvm.return +} + + +// CHECK: %[[VAL_0:.*]] = alloca { ptr, ptr }, align 8 +// CHECK: br label %[[VAL_1:.*]] +// CHECK: entry: ; preds = %[[VAL_2:.*]] +// CHECK: %[[VAL_3:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: br label %[[VAL_4:.*]] +// CHECK: omp_parallel: ; preds = %[[VAL_1]] +// CHECK: %[[VAL_5:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 0 +// CHECK: store ptr %[[VAL_6:.*]], ptr %[[VAL_5]], align 8 +// CHECK: %[[VAL_7:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 1 +// CHECK: store ptr %[[VAL_8:.*]], ptr %[[VAL_7]], align 8 +// CHECK: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @1, i32 1, ptr @missordered_blocks_..omp_par, ptr %[[VAL_0]]) +// CHECK: br label %[[VAL_9:.*]] +// CHECK: omp.par.outlined.exit: ; preds = %[[VAL_4]] +// CHECK: br label %[[VAL_10:.*]] +// CHECK: omp.par.exit.split: ; preds = %[[VAL_9]] +// CHECK: ret void +// CHECK: omp.par.entry: +// CHECK: %[[VAL_11:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12:.*]], i32 0, i32 0 +// CHECK: %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8 +// CHECK: %[[VAL_14:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12]], i32 0, i32 1 +// CHECK: %[[VAL_15:.*]] = load ptr, ptr %[[VAL_14]], align 8 +// CHECK: %[[VAL_16:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_17:.*]] = load i32, ptr %[[VAL_18:.*]], align 4 +// CHECK: store i32 %[[VAL_17]], ptr %[[VAL_16]], align 4 +// CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4 +// CHECK: %[[VAL_20:.*]] = alloca ptr, align 8 +// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 +// CHECK: %[[VAL_22:.*]] = alloca [2 x ptr], align 8 +// CHECK: br label %[[VAL_23:.*]] +// CHECK: omp.reduction.init: ; preds = %[[VAL_24:.*]] +// CHECK: br label %[[VAL_25:.*]] +// CHECK: omp.reduction.neutral: ; preds = %[[VAL_23]] +// CHECK: %[[VAL_26:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_27:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_28:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_29:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_13]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_29]], ptr %[[VAL_28]], align 8 +// CHECK: %[[VAL_30:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 +// CHECK: %[[VAL_31:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_28]], i32 0, i32 0 +// CHECK: %[[VAL_32:.*]] = load ptr, ptr %[[VAL_31]], align 8 +// CHECK: %[[VAL_33:.*]] = ptrtoint ptr %[[VAL_32]] to i64 +// CHECK: %[[VAL_34:.*]] = icmp eq i64 %[[VAL_33]], 0 +// CHECK: br i1 %[[VAL_34]], label 
%[[VAL_35:.*]], label %[[VAL_36:.*]] +// CHECK: omp.reduction.neutral2: ; preds = %[[VAL_25]] +// CHECK: store i32 0, ptr null, align 4 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_26]], align 8 +// CHECK: %[[VAL_37:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_26]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_37]], ptr %[[VAL_30]], align 8 +// CHECK: br label %[[VAL_38:.*]] +// CHECK: omp.reduction.neutral3: ; preds = %[[VAL_35]], %[[VAL_36]] +// CHECK: br label %[[VAL_39:.*]] +// CHECK: omp.region.cont: ; preds = %[[VAL_38]] +// CHECK: %[[VAL_40:.*]] = phi ptr [ %[[VAL_30]], %[[VAL_38]] ] +// CHECK: store ptr %[[VAL_40]], ptr %[[VAL_20]], align 8 +// CHECK: br label %[[VAL_41:.*]] +// CHECK: omp.reduction.neutral5: ; preds = %[[VAL_39]] +// CHECK: %[[VAL_42:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_43:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_44:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_45:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_15]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_45]], ptr %[[VAL_44]], align 8 +// CHECK: %[[VAL_46:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 +// CHECK: %[[VAL_47:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_44]], i32 0, i32 0 +// CHECK: %[[VAL_48:.*]] = load ptr, ptr %[[VAL_47]], align 8 +// CHECK: %[[VAL_49:.*]] = ptrtoint ptr %[[VAL_48]] to i64 +// CHECK: %[[VAL_50:.*]] = icmp eq i64 %[[VAL_49]], 0 +// CHECK: br i1 %[[VAL_50]], label %[[VAL_51:.*]], label %[[VAL_52:.*]] +// CHECK: omp.reduction.neutral7: ; preds = %[[VAL_41]] +// CHECK: store i32 0, ptr null, align 4 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to 
i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_42]], align 8 +// CHECK: %[[VAL_53:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_42]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_53]], ptr %[[VAL_46]], align 8 +// CHECK: br label %[[VAL_54:.*]] +// CHECK: omp.reduction.neutral8: ; preds = %[[VAL_51]], %[[VAL_52]] +// CHECK: br label %[[VAL_55:.*]] +// CHECK: omp.region.cont4: ; preds = %[[VAL_54]] +// CHECK: %[[VAL_56:.*]] = phi ptr [ %[[VAL_46]], %[[VAL_54]] ] +// CHECK: store ptr %[[VAL_56]], ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_57:.*]] +// CHECK: omp.par.region: ; preds = %[[VAL_55]] +// CHECK: br label %[[VAL_58:.*]] +// CHECK: omp.par.region10: ; preds = %[[VAL_57]] +// CHECK: %[[VAL_59:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_60:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_61:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_40]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_61]], ptr %[[VAL_60]], align 8 +// CHECK: %[[VAL_62:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_60]], i32 0, i32 0 +// CHECK: %[[VAL_63:.*]] = load ptr, ptr %[[VAL_62]], align 8 +// CHECK: store i32 42, ptr %[[VAL_63]], align 4 +// CHECK: %[[VAL_64:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_56]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_64]], ptr %[[VAL_59]], align 8 +// CHECK: %[[VAL_65:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_59]], i32 0, i32 0 +// CHECK: %[[VAL_66:.*]] = load ptr, ptr %[[VAL_65]], align 8 +// CHECK: store i32 24, ptr %[[VAL_66]], align 4 +// CHECK: br label %[[VAL_67:.*]] +// CHECK: omp.region.cont9: ; preds = %[[VAL_58]] +// CHECK: %[[VAL_68:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 0 +// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_68]], align 8 +// CHECK: %[[VAL_69:.*]] = getelementptr inbounds [2 x ptr], 
ptr %[[VAL_22]], i64 0, i64 1 +// CHECK: store ptr %[[VAL_21]], ptr %[[VAL_69]], align 8 +// CHECK: %[[VAL_70:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_71:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_70]], i32 2, i64 16, ptr %[[VAL_22]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var) +// CHECK: switch i32 %[[VAL_71]], label %[[VAL_72:.*]] [ +// CHECK: i32 1, label %[[VAL_73:.*]] +// CHECK: i32 2, label %[[VAL_74:.*]] +// CHECK: ] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_67]] +// CHECK: unreachable +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_67]] +// CHECK: %[[VAL_75:.*]] = load ptr, ptr %[[VAL_20]], align 8 +// CHECK: %[[VAL_76:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_77:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_78:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_13]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_78]], ptr %[[VAL_77]], align 8 +// CHECK: %[[VAL_79:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_75]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_79]], ptr %[[VAL_76]], align 8 +// CHECK: %[[VAL_80:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_77]], i32 0, i32 0 +// CHECK: %[[VAL_81:.*]] = load ptr, ptr %[[VAL_80]], align 8 +// CHECK: %[[VAL_82:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_76]], i32 0, i32 0 +// CHECK: %[[VAL_83:.*]] = load ptr, ptr %[[VAL_82]], align 8 +// CHECK: %[[VAL_84:.*]] = load i32, ptr %[[VAL_81]], align 4 +// CHECK: %[[VAL_85:.*]] = load i32, ptr %[[VAL_83]], align 4 +// CHECK: %[[VAL_86:.*]] = add i32 %[[VAL_84]], %[[VAL_85]] +// CHECK: store i32 %[[VAL_86]], ptr %[[VAL_81]], align 4 +// CHECK: %[[VAL_87:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: %[[VAL_88:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_89:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 
}, align 8 +// CHECK: %[[VAL_90:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_15]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_90]], ptr %[[VAL_89]], align 8 +// CHECK: %[[VAL_91:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_87]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_91]], ptr %[[VAL_88]], align 8 +// CHECK: %[[VAL_92:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_89]], i32 0, i32 0 +// CHECK: %[[VAL_93:.*]] = load ptr, ptr %[[VAL_92]], align 8 +// CHECK: %[[VAL_94:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_88]], i32 0, i32 0 +// CHECK: %[[VAL_95:.*]] = load ptr, ptr %[[VAL_94]], align 8 +// CHECK: %[[VAL_96:.*]] = load i32, ptr %[[VAL_93]], align 4 +// CHECK: %[[VAL_97:.*]] = load i32, ptr %[[VAL_95]], align 4 +// CHECK: %[[VAL_98:.*]] = add i32 %[[VAL_96]], %[[VAL_97]] +// CHECK: store i32 %[[VAL_98]], ptr %[[VAL_93]], align 4 +// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_70]], ptr @.gomp_critical_user_.reduction.var) +// CHECK: br label %[[VAL_72]] +// CHECK: reduce.finalize: ; preds = %[[VAL_73]], %[[VAL_67]] +// CHECK: br label %[[VAL_99:.*]] +// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_72]] +// CHECK: %[[VAL_100:.*]] = load ptr, ptr %[[VAL_20]], align 8 +// CHECK: br label %[[VAL_101:.*]] +// CHECK: omp.reduction.cleanup: ; preds = %[[VAL_99]] +// CHECK: %[[VAL_102:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_103:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_100]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_103]], ptr %[[VAL_102]], align 8 +// CHECK: %[[VAL_104:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_102]], i32 0, i32 0 +// CHECK: %[[VAL_105:.*]] = load ptr, ptr %[[VAL_104]], align 8 +// CHECK: %[[VAL_106:.*]] = ptrtoint ptr %[[VAL_105]] to i64 +// CHECK: %[[VAL_107:.*]] = icmp ne i64 %[[VAL_106]], 0 +// CHECK: br i1 %[[VAL_107]], 
label %[[VAL_108:.*]], label %[[VAL_109:.*]] +// CHECK: omp.reduction.cleanup14: ; preds = %[[VAL_108]], %[[VAL_101]] +// CHECK: br label %[[VAL_110:.*]] +// CHECK: omp.region.cont12: ; preds = %[[VAL_109]] +// CHECK: %[[VAL_111:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_112:.*]] +// CHECK: omp.reduction.cleanup16: ; preds = %[[VAL_110]] +// CHECK: %[[VAL_113:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: %[[VAL_114:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_111]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_114]], ptr %[[VAL_113]], align 8 +// CHECK: %[[VAL_115:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_113]], i32 0, i32 0 +// CHECK: %[[VAL_116:.*]] = load ptr, ptr %[[VAL_115]], align 8 +// CHECK: %[[VAL_117:.*]] = ptrtoint ptr %[[VAL_116]] to i64 +// CHECK: %[[VAL_118:.*]] = icmp ne i64 %[[VAL_117]], 0 +// CHECK: br i1 %[[VAL_118]], label %[[VAL_119:.*]], label %[[VAL_120:.*]] +// CHECK: omp.reduction.cleanup18: ; preds = %[[VAL_119]], %[[VAL_112]] +// CHECK: br label %[[VAL_121:.*]] +// CHECK: omp.region.cont15: ; preds = %[[VAL_120]] +// CHECK: br label %[[VAL_122:.*]] +// CHECK: omp.reduction.cleanup17: ; preds = %[[VAL_112]] +// CHECK: br label %[[VAL_120]] +// CHECK: omp.reduction.cleanup13: ; preds = %[[VAL_101]] +// CHECK: br label %[[VAL_109]] +// CHECK: omp.reduction.neutral6: ; preds = %[[VAL_41]] +// CHECK: %[[VAL_123:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_48]], 0 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_123]], ptr %[[VAL_43]], align 8 +// CHECK: %[[VAL_124:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_43]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_124]], ptr %[[VAL_46]], align 8 +// CHECK: br label %[[VAL_54]] +// CHECK: omp.reduction.neutral1: ; preds 
= %[[VAL_25]] +// CHECK: %[[VAL_125:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_32]], 0 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_125]], ptr %[[VAL_27]], align 8 +// CHECK: %[[VAL_126:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_27]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_126]], ptr %[[VAL_30]], align 8 +// CHECK: br label %[[VAL_38]] +// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_121]] +// CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir index 361905f7cddeb..0f757de39a006 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir @@ -61,10 +61,10 @@ module { // CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4 // CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 // CHECK: %[[VAL_23:.*]] = alloca ptr, align 8 -// CHECK: %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8 // CHECK: %[[VAL_24:.*]] = alloca [2 x ptr], align 8 // CHECK: br label %[[INIT_LABEL:.*]] // CHECK: [[INIT_LABEL]]: +// CHECK: %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8 // CHECK: store ptr %[[VAL_13]], ptr %[[VAL_21]], align 8 // CHECK: %[[VAL_22:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_15]], align 8 // CHECK: store ptr %[[VAL_15]], ptr %[[VAL_23]], align 8 From 5b39edff5d21242e3af64d863ac68cdd6b458f29 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 19 Jun 2024 13:00:44 +0000 Subject: [PATCH 2/4] Add braces --- .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 9fe63a9655be2..7793d5da952ef 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -390,15 +390,16 @@ static LogicalResult inlineConvertOmpRegions( if (potentialTerminator && potentialTerminator->isTerminator()) { llvm::BasicBlock *block = builder.GetInsertBlock(); - if (block->empty()) + if (block->empty()) { // this can happen for really simple reduction init regions e.g. // %0 = llvm.mlir.constant(0 : i32) : i32 // omp.yield(%0 : i32) // because the llvm.mlir.constant (MLIR op) isn't converted into any // llvm op potentialTerminator->insertInto(block, block->begin()); - else + } else { potentialTerminator->insertAfter(&block->back()); + } } return success(); From 2be9ad26b63872eea8610f575ec13b4956be04af Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 19 Jun 2024 13:13:37 +0000 Subject: [PATCH 3/4] canonicalize and cse test --- .../openmp-parallel-reduction-multiblock.mlir | 194 ++++++++---------- 1 file changed, 86 insertions(+), 108 deletions(-) diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir index 00020bd4c9d1e..4952b15287f81 100644 --- a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir @@ -13,142 +13,120 @@ omp.declare_reduction @add_reduction_byref_box_heap_i32 : !llvm.ptr init { ^bb0(%arg0: !llvm.ptr): - %0 = llvm.mlir.constant(1 : i32) : i32 - %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %2 = llvm.mlir.constant(1 : i32) : i32 - %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %4 = 
llvm.mlir.constant(1 : i32) : i32 - %5 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %6 = llvm.mlir.constant(0 : i64) : i64 - %7 = llvm.mlir.constant(0 : i32) : i32 - %8 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %8, %5 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %9 = llvm.mlir.constant(1 : i64) : i64 - %10 = llvm.alloca %9 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr - %11 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %12 = llvm.load %11 : !llvm.ptr -> !llvm.ptr - %13 = llvm.ptrtoint %12 : !llvm.ptr to i64 - %14 = llvm.icmp "eq" %13, %6 : i64 - llvm.cond_br %14, ^bb1, ^bb2 + %0 = llvm.mlir.constant(2 : i32) : i32 + %1 = llvm.mlir.constant(20180515 : i32) : i32 + %2 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %3 = llvm.mlir.zero : !llvm.ptr + %4 = llvm.mlir.constant(9 : i32) : i32 + %5 = llvm.mlir.constant(1 : i64) : i64 + %6 = llvm.mlir.constant(0 : i32) : i32 + %7 = llvm.mlir.constant(0 : i64) : i64 + %8 = llvm.mlir.constant(1 : i32) : i32 + %9 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %10 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %11 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %12 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %12, %11 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %13 = llvm.alloca %5 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr + %14 = llvm.getelementptr %11[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr + %16 = llvm.ptrtoint %15 : !llvm.ptr to i64 + 
%17 = llvm.icmp "eq" %16, %7 : i64 + llvm.cond_br %17, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - %15 = llvm.mlir.constant(9 : i32) : i32 - %16 = llvm.mlir.zero : !llvm.ptr - %17 = llvm.getelementptr %16[1] : (!llvm.ptr) -> !llvm.ptr, i32 - %18 = llvm.ptrtoint %17 : !llvm.ptr to i64 - %19 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %20 = llvm.insertvalue %18, %19[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %21 = llvm.mlir.constant(20180515 : i32) : i32 - %22 = llvm.insertvalue %21, %20[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %23 = llvm.mlir.constant(0 : i32) : i32 - %24 = llvm.trunc %23 : i32 to i8 - %25 = llvm.insertvalue %24, %22[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %26 = llvm.trunc %15 : i32 to i8 - %27 = llvm.insertvalue %26, %25[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %28 = llvm.mlir.constant(2 : i32) : i32 - %29 = llvm.trunc %28 : i32 to i8 - %30 = llvm.insertvalue %29, %27[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %31 = llvm.mlir.constant(0 : i32) : i32 - %32 = llvm.trunc %31 : i32 to i8 - %33 = llvm.insertvalue %32, %30[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %34 = llvm.insertvalue %12, %33[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %34, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %35 = llvm.load %3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %35, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %18 = llvm.getelementptr %3[1] : (!llvm.ptr) -> !llvm.ptr, i32 + %19 = llvm.ptrtoint %18 : !llvm.ptr to i64 + %20 = llvm.insertvalue %19, %2[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %21 = llvm.insertvalue %1, %20[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %22 = llvm.trunc %6 : i32 to i8 + %23 = llvm.insertvalue %22, %21[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %24 = llvm.trunc %4 : i32 to i8 + %25 = llvm.insertvalue %24, %23[4] : !llvm.struct<(ptr, i64, i32, 
i8, i8, i8, i8)> + %26 = llvm.trunc %0 : i32 to i8 + %27 = llvm.insertvalue %26, %25[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %28 = llvm.insertvalue %22, %27[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %29 = llvm.insertvalue %15, %28[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %29, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %30 = llvm.load %10 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %30, %13 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr llvm.br ^bb3 ^bb2: // pred: ^bb0 - %36 = llvm.mlir.zero : !llvm.ptr - %37 = llvm.getelementptr %36[1] : (!llvm.ptr) -> !llvm.ptr, i32 - %38 = llvm.ptrtoint %37 : !llvm.ptr to i64 - //%39 = llvm.call @malloc(%38) {in_type = i32, operandSegmentSizes = array} : (i64) -> !llvm.ptr - %39 = llvm.mlir.zero : !llvm.ptr - llvm.store %7, %39 : i32, !llvm.ptr - %40 = llvm.mlir.constant(9 : i32) : i32 - %41 = llvm.mlir.zero : !llvm.ptr - %42 = llvm.getelementptr %41[1] : (!llvm.ptr) -> !llvm.ptr, i32 - %43 = llvm.ptrtoint %42 : !llvm.ptr to i64 - %44 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %45 = llvm.insertvalue %43, %44[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %46 = llvm.mlir.constant(20180515 : i32) : i32 - %47 = llvm.insertvalue %46, %45[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %48 = llvm.mlir.constant(0 : i32) : i32 - %49 = llvm.trunc %48 : i32 to i8 - %50 = llvm.insertvalue %49, %47[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %51 = llvm.trunc %40 : i32 to i8 - %52 = llvm.insertvalue %51, %50[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %53 = llvm.mlir.constant(2 : i32) : i32 - %54 = llvm.trunc %53 : i32 to i8 - %55 = llvm.insertvalue %54, %52[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %56 = llvm.mlir.constant(0 : i32) : i32 - %57 = llvm.trunc %56 : i32 to i8 - %58 = llvm.insertvalue %57, %55[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %59 = 
llvm.insertvalue %39, %58[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %59, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %60 = llvm.load %1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %60, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %31 = llvm.getelementptr %3[1] : (!llvm.ptr) -> !llvm.ptr, i32 + llvm.store %6, %3 : i32, !llvm.ptr + %32 = llvm.ptrtoint %31 : !llvm.ptr to i64 + %33 = llvm.insertvalue %32, %2[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %34 = llvm.insertvalue %1, %33[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %35 = llvm.trunc %6 : i32 to i8 + %36 = llvm.insertvalue %35, %34[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %37 = llvm.trunc %4 : i32 to i8 + %38 = llvm.insertvalue %37, %36[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %39 = llvm.trunc %0 : i32 to i8 + %40 = llvm.insertvalue %39, %38[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %41 = llvm.insertvalue %35, %40[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %42 = llvm.insertvalue %3, %41[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %42, %9 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %43 = llvm.load %9 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %43, %13 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr llvm.br ^bb3 ^bb3: // 2 preds: ^bb1, ^bb2 - omp.yield(%10 : !llvm.ptr) + omp.yield(%13 : !llvm.ptr) } combiner { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): %0 = llvm.mlir.constant(1 : i32) : i32 %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %2 = llvm.mlir.constant(1 : i32) : i32 - %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %4 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %4, %3 : !llvm.struct<(ptr, i64, i32, 
i8, i8, i8, i8)>, !llvm.ptr - %5 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %5, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %6 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr - %8 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr - %10 = llvm.load %7 : !llvm.ptr -> i32 - %11 = llvm.load %9 : !llvm.ptr -> i32 - %12 = llvm.add %10, %11 : i32 - llvm.store %12, %7 : i32, !llvm.ptr + %2 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %3, %2 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %4 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %4, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %5 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %6 = llvm.load %5 : !llvm.ptr -> !llvm.ptr + %7 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %8 = llvm.load %7 : !llvm.ptr -> !llvm.ptr + %9 = llvm.load %6 : !llvm.ptr -> i32 + %10 = llvm.load %8 : !llvm.ptr -> i32 + %11 = llvm.add %9, %10 : i32 + llvm.store %11, %6 : i32, !llvm.ptr omp.yield(%arg0 : !llvm.ptr) } cleanup { ^bb0(%arg0: !llvm.ptr): - %0 = llvm.mlir.constant(1 : i32) : i32 - %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %2 = llvm.mlir.constant(0 : i64) : i64 + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.mlir.constant(1 : i32) : i32 + %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr 
%3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %3, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %3, %2 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %4 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> %5 = llvm.load %4 : !llvm.ptr -> !llvm.ptr %6 = llvm.ptrtoint %5 : !llvm.ptr to i64 - %7 = llvm.icmp "ne" %6, %2 : i64 + %7 = llvm.icmp "ne" %6, %0 : i64 llvm.cond_br %7, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - //llvm.call @free(%5) : (!llvm.ptr) -> () llvm.br ^bb2 ^bb2: // 2 preds: ^bb0, ^bb1 omp.yield } llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: !llvm.ptr {fir.bindc_name = "y"}) attributes {fir.internal_name = "_QPmissordered_blocks", frame_pointer = #llvm.framePointerKind<"non-leaf">, target_cpu = "generic", target_features = #llvm.target_features<["+outline-atomics", "+v8a", "+fp-armv8", "+neon"]>} { - %0 = llvm.mlir.constant(24 : i32) : i32 - %1 = llvm.mlir.constant(42 : i32) : i32 + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.mlir.constant(24 : i32) : i32 + %2 = llvm.mlir.constant(42 : i32) : i32 omp.parallel reduction(byref @add_reduction_byref_box_heap_i32 %arg0 -> %arg2 : !llvm.ptr, byref @add_reduction_byref_box_heap_i32 %arg1 -> %arg3 : !llvm.ptr) { - %2 = llvm.mlir.constant(1 : i32) : i32 - %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %4 = llvm.mlir.constant(1 : i32) : i32 - %5 = llvm.alloca %4 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %6 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %6, %5 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %7 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> 
!llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %8 = llvm.load %7 : !llvm.ptr -> !llvm.ptr - llvm.store %1, %8 : i32, !llvm.ptr - %9 = llvm.load %arg3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %9, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %10 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr - llvm.store %0, %11 : i32, !llvm.ptr + %3 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %4 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %5 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %5, %4 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %6 = llvm.getelementptr %4[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr + llvm.store %2, %7 : i32, !llvm.ptr + %8 = llvm.load %arg3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + llvm.store %8, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr + %9 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr + llvm.store %1, %10 : i32, !llvm.ptr omp.terminator } llvm.return } - // CHECK: %[[VAL_0:.*]] = alloca { ptr, ptr }, align 8 // CHECK: br label %[[VAL_1:.*]] // CHECK: entry: ; preds = %[[VAL_2:.*]] From 91076aabd9d29752c490a1ef79f9ac9c38a10583 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 20 Jun 2024 10:02:14 +0000 Subject: [PATCH 4/4] Minimize test --- .../openmp-parallel-reduction-multiblock.mlir | 303 +++--------------- 1 file changed, 51 insertions(+), 252 deletions(-) diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir 
b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir index 4952b15287f81..f4d599538ac4a 100644 --- a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-multiblock.mlir @@ -1,127 +1,28 @@ // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s -// generated by flang-new: -// subroutine missordered_blocks(x,y) -// integer, allocatable :: x, y -// !$omp parallel reduction(+:x,y) -// x = 42 -// y = 24 -// !$omp end parallel -// end subroutine - // This is basically a test that we don't crash while translating this IR omp.declare_reduction @add_reduction_byref_box_heap_i32 : !llvm.ptr init { ^bb0(%arg0: !llvm.ptr): - %0 = llvm.mlir.constant(2 : i32) : i32 - %1 = llvm.mlir.constant(20180515 : i32) : i32 - %2 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %3 = llvm.mlir.zero : !llvm.ptr - %4 = llvm.mlir.constant(9 : i32) : i32 - %5 = llvm.mlir.constant(1 : i64) : i64 - %6 = llvm.mlir.constant(0 : i32) : i32 %7 = llvm.mlir.constant(0 : i64) : i64 - %8 = llvm.mlir.constant(1 : i32) : i32 - %9 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %10 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %11 = llvm.alloca %8 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %12 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %12, %11 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %13 = llvm.alloca %5 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr - %14 = llvm.getelementptr %11[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %15 = llvm.load %14 : !llvm.ptr -> !llvm.ptr - %16 = llvm.ptrtoint %15 : !llvm.ptr to i64 + %16 = llvm.ptrtoint %arg0 : !llvm.ptr to i64 %17 = llvm.icmp "eq" %16, %7 : i64 llvm.cond_br 
%17, ^bb1, ^bb2 ^bb1: // pred: ^bb0 - %18 = llvm.getelementptr %3[1] : (!llvm.ptr) -> !llvm.ptr, i32 - %19 = llvm.ptrtoint %18 : !llvm.ptr to i64 - %20 = llvm.insertvalue %19, %2[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %21 = llvm.insertvalue %1, %20[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %22 = llvm.trunc %6 : i32 to i8 - %23 = llvm.insertvalue %22, %21[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %24 = llvm.trunc %4 : i32 to i8 - %25 = llvm.insertvalue %24, %23[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %26 = llvm.trunc %0 : i32 to i8 - %27 = llvm.insertvalue %26, %25[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %28 = llvm.insertvalue %22, %27[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %29 = llvm.insertvalue %15, %28[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %29, %10 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %30 = llvm.load %10 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %30, %13 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr llvm.br ^bb3 ^bb2: // pred: ^bb0 - %31 = llvm.getelementptr %3[1] : (!llvm.ptr) -> !llvm.ptr, i32 - llvm.store %6, %3 : i32, !llvm.ptr - %32 = llvm.ptrtoint %31 : !llvm.ptr to i64 - %33 = llvm.insertvalue %32, %2[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %34 = llvm.insertvalue %1, %33[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %35 = llvm.trunc %6 : i32 to i8 - %36 = llvm.insertvalue %35, %34[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %37 = llvm.trunc %4 : i32 to i8 - %38 = llvm.insertvalue %37, %36[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %39 = llvm.trunc %0 : i32 to i8 - %40 = llvm.insertvalue %39, %38[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %41 = llvm.insertvalue %35, %40[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %42 = llvm.insertvalue %3, %41[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %42, %9 : 
!llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %43 = llvm.load %9 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %43, %13 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr llvm.br ^bb3 ^bb3: // 2 preds: ^bb1, ^bb2 - omp.yield(%13 : !llvm.ptr) + omp.yield(%arg0 : !llvm.ptr) } combiner { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): - %0 = llvm.mlir.constant(1 : i32) : i32 - %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %2 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %3, %2 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %4 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %4, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %5 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %6 = llvm.load %5 : !llvm.ptr -> !llvm.ptr - %7 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %8 = llvm.load %7 : !llvm.ptr -> !llvm.ptr - %9 = llvm.load %6 : !llvm.ptr -> i32 - %10 = llvm.load %8 : !llvm.ptr -> i32 - %11 = llvm.add %9, %10 : i32 - llvm.store %11, %6 : i32, !llvm.ptr omp.yield(%arg0 : !llvm.ptr) } cleanup { ^bb0(%arg0: !llvm.ptr): - %0 = llvm.mlir.constant(0 : i64) : i64 - %1 = llvm.mlir.constant(1 : i32) : i32 - %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %3, %2 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %4 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %5 = llvm.load %4 : !llvm.ptr -> 
!llvm.ptr - %6 = llvm.ptrtoint %5 : !llvm.ptr to i64 - %7 = llvm.icmp "ne" %6, %0 : i64 - llvm.cond_br %7, ^bb1, ^bb2 -^bb1: // pred: ^bb0 - llvm.br ^bb2 -^bb2: // 2 preds: ^bb0, ^bb1 omp.yield } llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: !llvm.ptr {fir.bindc_name = "y"}) attributes {fir.internal_name = "_QPmissordered_blocks", frame_pointer = #llvm.framePointerKind<"non-leaf">, target_cpu = "generic", target_features = #llvm.target_features<["+outline-atomics", "+v8a", "+fp-armv8", "+neon"]>} { - %0 = llvm.mlir.constant(1 : i32) : i32 - %1 = llvm.mlir.constant(24 : i32) : i32 - %2 = llvm.mlir.constant(42 : i32) : i32 omp.parallel reduction(byref @add_reduction_byref_box_heap_i32 %arg0 -> %arg2 : !llvm.ptr, byref @add_reduction_byref_box_heap_i32 %arg1 -> %arg3 : !llvm.ptr) { - %3 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %4 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr - %5 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %5, %4 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %6 = llvm.getelementptr %4[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr - llvm.store %2, %7 : i32, !llvm.ptr - %8 = llvm.load %arg3 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - llvm.store %8, %3 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr - %9 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> - %10 = llvm.load %9 : !llvm.ptr -> !llvm.ptr - llvm.store %1, %10 : i32, !llvm.ptr omp.terminator } llvm.return @@ -159,162 +60,60 @@ llvm.func @missordered_blocks_(%arg0: !llvm.ptr {fir.bindc_name = "x"}, %arg1: ! 
// CHECK: omp.reduction.init: ; preds = %[[VAL_24:.*]] // CHECK: br label %[[VAL_25:.*]] // CHECK: omp.reduction.neutral: ; preds = %[[VAL_23]] -// CHECK: %[[VAL_26:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_27:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_28:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_29:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_13]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_29]], ptr %[[VAL_28]], align 8 -// CHECK: %[[VAL_30:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 -// CHECK: %[[VAL_31:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_28]], i32 0, i32 0 -// CHECK: %[[VAL_32:.*]] = load ptr, ptr %[[VAL_31]], align 8 -// CHECK: %[[VAL_33:.*]] = ptrtoint ptr %[[VAL_32]] to i64 -// CHECK: %[[VAL_34:.*]] = icmp eq i64 %[[VAL_33]], 0 -// CHECK: br i1 %[[VAL_34]], label %[[VAL_35:.*]], label %[[VAL_36:.*]] +// CHECK: %[[VAL_26:.*]] = ptrtoint ptr %[[VAL_13]] to i64 +// CHECK: %[[VAL_27:.*]] = icmp eq i64 %[[VAL_26]], 0 +// CHECK: br i1 %[[VAL_27]], label %[[VAL_28:.*]], label %[[VAL_29:.*]] // CHECK: omp.reduction.neutral2: ; preds = %[[VAL_25]] -// CHECK: store i32 0, ptr null, align 4 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_26]], align 8 -// CHECK: %[[VAL_37:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_26]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_37]], ptr %[[VAL_30]], align 8 +// CHECK: br label %[[VAL_30:.*]] +// CHECK: omp.reduction.neutral3: ; preds = %[[VAL_28]], %[[VAL_29]] +// CHECK: br label %[[VAL_31:.*]] +// CHECK: omp.region.cont: ; preds = %[[VAL_30]] +// CHECK: %[[VAL_32:.*]] = phi ptr [ %[[VAL_13]], %[[VAL_30]] ] +// CHECK: store ptr %[[VAL_32]], ptr %[[VAL_20]], align 8 +// CHECK: br label 
%[[VAL_33:.*]] +// CHECK: omp.reduction.neutral5: ; preds = %[[VAL_31]] +// CHECK: %[[VAL_34:.*]] = ptrtoint ptr %[[VAL_15]] to i64 +// CHECK: %[[VAL_35:.*]] = icmp eq i64 %[[VAL_34]], 0 +// CHECK: br i1 %[[VAL_35]], label %[[VAL_36:.*]], label %[[VAL_37:.*]] +// CHECK: omp.reduction.neutral7: ; preds = %[[VAL_33]] // CHECK: br label %[[VAL_38:.*]] -// CHECK: omp.reduction.neutral3: ; preds = %[[VAL_35]], %[[VAL_36]] +// CHECK: omp.reduction.neutral8: ; preds = %[[VAL_36]], %[[VAL_37]] // CHECK: br label %[[VAL_39:.*]] -// CHECK: omp.region.cont: ; preds = %[[VAL_38]] -// CHECK: %[[VAL_40:.*]] = phi ptr [ %[[VAL_30]], %[[VAL_38]] ] -// CHECK: store ptr %[[VAL_40]], ptr %[[VAL_20]], align 8 +// CHECK: omp.region.cont4: ; preds = %[[VAL_38]] +// CHECK: %[[VAL_40:.*]] = phi ptr [ %[[VAL_15]], %[[VAL_38]] ] +// CHECK: store ptr %[[VAL_40]], ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_41:.*]] -// CHECK: omp.reduction.neutral5: ; preds = %[[VAL_39]] -// CHECK: %[[VAL_42:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_43:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_44:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_45:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_15]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_45]], ptr %[[VAL_44]], align 8 -// CHECK: %[[VAL_46:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 -// CHECK: %[[VAL_47:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_44]], i32 0, i32 0 -// CHECK: %[[VAL_48:.*]] = load ptr, ptr %[[VAL_47]], align 8 -// CHECK: %[[VAL_49:.*]] = ptrtoint ptr %[[VAL_48]] to i64 -// CHECK: %[[VAL_50:.*]] = icmp eq i64 %[[VAL_49]], 0 -// CHECK: br i1 %[[VAL_50]], label %[[VAL_51:.*]], label %[[VAL_52:.*]] -// CHECK: omp.reduction.neutral7: ; preds = %[[VAL_41]] -// CHECK: store i32 0, ptr null, align 4 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint 
(ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_42]], align 8 -// CHECK: %[[VAL_53:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_42]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_53]], ptr %[[VAL_46]], align 8 -// CHECK: br label %[[VAL_54:.*]] -// CHECK: omp.reduction.neutral8: ; preds = %[[VAL_51]], %[[VAL_52]] -// CHECK: br label %[[VAL_55:.*]] -// CHECK: omp.region.cont4: ; preds = %[[VAL_54]] -// CHECK: %[[VAL_56:.*]] = phi ptr [ %[[VAL_46]], %[[VAL_54]] ] -// CHECK: store ptr %[[VAL_56]], ptr %[[VAL_21]], align 8 -// CHECK: br label %[[VAL_57:.*]] -// CHECK: omp.par.region: ; preds = %[[VAL_55]] -// CHECK: br label %[[VAL_58:.*]] -// CHECK: omp.par.region10: ; preds = %[[VAL_57]] -// CHECK: %[[VAL_59:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_60:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_61:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_40]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_61]], ptr %[[VAL_60]], align 8 -// CHECK: %[[VAL_62:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_60]], i32 0, i32 0 -// CHECK: %[[VAL_63:.*]] = load ptr, ptr %[[VAL_62]], align 8 -// CHECK: store i32 42, ptr %[[VAL_63]], align 4 -// CHECK: %[[VAL_64:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_56]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_64]], ptr %[[VAL_59]], align 8 -// CHECK: %[[VAL_65:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_59]], i32 0, i32 0 -// CHECK: %[[VAL_66:.*]] = load ptr, ptr %[[VAL_65]], align 8 -// CHECK: store i32 24, ptr %[[VAL_66]], align 4 -// CHECK: br label %[[VAL_67:.*]] -// CHECK: omp.region.cont9: ; preds = %[[VAL_58]] -// CHECK: %[[VAL_68:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 0 -// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_68]], align 8 -// CHECK: 
%[[VAL_69:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 1 -// CHECK: store ptr %[[VAL_21]], ptr %[[VAL_69]], align 8 -// CHECK: %[[VAL_70:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) -// CHECK: %[[VAL_71:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_70]], i32 2, i64 16, ptr %[[VAL_22]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var) -// CHECK: switch i32 %[[VAL_71]], label %[[VAL_72:.*]] [ -// CHECK: i32 1, label %[[VAL_73:.*]] -// CHECK: i32 2, label %[[VAL_74:.*]] +// CHECK: omp.par.region: ; preds = %[[VAL_39]] +// CHECK: br label %[[VAL_42:.*]] +// CHECK: omp.par.region10: ; preds = %[[VAL_41]] +// CHECK: br label %[[VAL_43:.*]] +// CHECK: omp.region.cont9: ; preds = %[[VAL_42]] +// CHECK: %[[VAL_44:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 0 +// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_44]], align 8 +// CHECK: %[[VAL_45:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_22]], i64 0, i64 1 +// CHECK: store ptr %[[VAL_21]], ptr %[[VAL_45]], align 8 +// CHECK: %[[VAL_46:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_47:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_46]], i32 2, i64 16, ptr %[[VAL_22]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var) +// CHECK: switch i32 %[[VAL_47]], label %[[VAL_48:.*]] [ +// CHECK: i32 1, label %[[VAL_49:.*]] +// CHECK: i32 2, label %[[VAL_50:.*]] // CHECK: ] -// CHECK: reduce.switch.atomic: ; preds = %[[VAL_67]] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_43]] // CHECK: unreachable -// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_67]] -// CHECK: %[[VAL_75:.*]] = load ptr, ptr %[[VAL_20]], align 8 -// CHECK: %[[VAL_76:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_77:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_78:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_13]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } 
%[[VAL_78]], ptr %[[VAL_77]], align 8 -// CHECK: %[[VAL_79:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_75]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_79]], ptr %[[VAL_76]], align 8 -// CHECK: %[[VAL_80:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_77]], i32 0, i32 0 -// CHECK: %[[VAL_81:.*]] = load ptr, ptr %[[VAL_80]], align 8 -// CHECK: %[[VAL_82:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_76]], i32 0, i32 0 -// CHECK: %[[VAL_83:.*]] = load ptr, ptr %[[VAL_82]], align 8 -// CHECK: %[[VAL_84:.*]] = load i32, ptr %[[VAL_81]], align 4 -// CHECK: %[[VAL_85:.*]] = load i32, ptr %[[VAL_83]], align 4 -// CHECK: %[[VAL_86:.*]] = add i32 %[[VAL_84]], %[[VAL_85]] -// CHECK: store i32 %[[VAL_86]], ptr %[[VAL_81]], align 4 -// CHECK: %[[VAL_87:.*]] = load ptr, ptr %[[VAL_21]], align 8 -// CHECK: %[[VAL_88:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_89:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_90:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_15]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_90]], ptr %[[VAL_89]], align 8 -// CHECK: %[[VAL_91:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_87]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_91]], ptr %[[VAL_88]], align 8 -// CHECK: %[[VAL_92:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_89]], i32 0, i32 0 -// CHECK: %[[VAL_93:.*]] = load ptr, ptr %[[VAL_92]], align 8 -// CHECK: %[[VAL_94:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_88]], i32 0, i32 0 -// CHECK: %[[VAL_95:.*]] = load ptr, ptr %[[VAL_94]], align 8 -// CHECK: %[[VAL_96:.*]] = load i32, ptr %[[VAL_93]], align 4 -// CHECK: %[[VAL_97:.*]] = load i32, ptr %[[VAL_95]], align 4 -// CHECK: %[[VAL_98:.*]] = add i32 %[[VAL_96]], %[[VAL_97]] -// CHECK: store i32 %[[VAL_98]], ptr %[[VAL_93]], align 4 -// CHECK: call void 
@__kmpc_end_reduce(ptr @1, i32 %[[VAL_70]], ptr @.gomp_critical_user_.reduction.var) -// CHECK: br label %[[VAL_72]] -// CHECK: reduce.finalize: ; preds = %[[VAL_73]], %[[VAL_67]] -// CHECK: br label %[[VAL_99:.*]] -// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_72]] -// CHECK: %[[VAL_100:.*]] = load ptr, ptr %[[VAL_20]], align 8 -// CHECK: br label %[[VAL_101:.*]] -// CHECK: omp.reduction.cleanup: ; preds = %[[VAL_99]] -// CHECK: %[[VAL_102:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_103:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_100]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_103]], ptr %[[VAL_102]], align 8 -// CHECK: %[[VAL_104:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_102]], i32 0, i32 0 -// CHECK: %[[VAL_105:.*]] = load ptr, ptr %[[VAL_104]], align 8 -// CHECK: %[[VAL_106:.*]] = ptrtoint ptr %[[VAL_105]] to i64 -// CHECK: %[[VAL_107:.*]] = icmp ne i64 %[[VAL_106]], 0 -// CHECK: br i1 %[[VAL_107]], label %[[VAL_108:.*]], label %[[VAL_109:.*]] -// CHECK: omp.reduction.cleanup14: ; preds = %[[VAL_108]], %[[VAL_101]] -// CHECK: br label %[[VAL_110:.*]] -// CHECK: omp.region.cont12: ; preds = %[[VAL_109]] -// CHECK: %[[VAL_111:.*]] = load ptr, ptr %[[VAL_21]], align 8 -// CHECK: br label %[[VAL_112:.*]] -// CHECK: omp.reduction.cleanup16: ; preds = %[[VAL_110]] -// CHECK: %[[VAL_113:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 -// CHECK: %[[VAL_114:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_111]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_114]], ptr %[[VAL_113]], align 8 -// CHECK: %[[VAL_115:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_113]], i32 0, i32 0 -// CHECK: %[[VAL_116:.*]] = load ptr, ptr %[[VAL_115]], align 8 -// CHECK: %[[VAL_117:.*]] = ptrtoint ptr %[[VAL_116]] to i64 -// CHECK: %[[VAL_118:.*]] = icmp ne i64 %[[VAL_117]], 0 -// CHECK: br i1 %[[VAL_118]], label %[[VAL_119:.*]], label 
%[[VAL_120:.*]] -// CHECK: omp.reduction.cleanup18: ; preds = %[[VAL_119]], %[[VAL_112]] -// CHECK: br label %[[VAL_121:.*]] -// CHECK: omp.region.cont15: ; preds = %[[VAL_120]] -// CHECK: br label %[[VAL_122:.*]] -// CHECK: omp.reduction.cleanup17: ; preds = %[[VAL_112]] -// CHECK: br label %[[VAL_120]] -// CHECK: omp.reduction.cleanup13: ; preds = %[[VAL_101]] -// CHECK: br label %[[VAL_109]] -// CHECK: omp.reduction.neutral6: ; preds = %[[VAL_41]] -// CHECK: %[[VAL_123:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_48]], 0 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_123]], ptr %[[VAL_43]], align 8 -// CHECK: %[[VAL_124:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_43]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_124]], ptr %[[VAL_46]], align 8 -// CHECK: br label %[[VAL_54]] -// CHECK: omp.reduction.neutral1: ; preds = %[[VAL_25]] -// CHECK: %[[VAL_125:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 2, i8 0 }, ptr %[[VAL_32]], 0 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_125]], ptr %[[VAL_27]], align 8 -// CHECK: %[[VAL_126:.*]] = load { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[VAL_27]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[VAL_126]], ptr %[[VAL_30]], align 8 +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_43]] +// CHECK: %[[VAL_51:.*]] = load ptr, ptr %[[VAL_20]], align 8 +// CHECK: %[[VAL_52:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_46]], ptr @.gomp_critical_user_.reduction.var) +// CHECK: br label %[[VAL_48]] +// CHECK: reduce.finalize: ; preds = %[[VAL_49]], %[[VAL_43]] +// CHECK: br label %[[VAL_53:.*]] +// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_48]] +// CHECK: 
%[[VAL_54:.*]] = load ptr, ptr %[[VAL_20]], align 8 +// CHECK: %[[VAL_55:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_56:.*]] +// CHECK: omp.reduction.neutral6: ; preds = %[[VAL_33]] // CHECK: br label %[[VAL_38]] -// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_121]] +// CHECK: omp.reduction.neutral1: ; preds = %[[VAL_25]] +// CHECK: br label %[[VAL_30]] +// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_53]] // CHECK: ret void