-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[ARM] Fix undefined behaviour in bf16->float conversion #116985
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This was implementing the float->bf16 conversion function using a left-shift of a signed integer, so for negative floating-point values a 1 was being shifted into the sign bit of the signed integer intermediate value. This is undefined behaviour, and was caught by UBSan. The vector versions are code-generated via Neon builtin functions, so probably don't have the same UB problem, but I've updated them anyway to be consistent. Fixes llvm#61983
@llvm/pr-subscribers-clang Author: Oliver Stannard (ostannard) ChangesThis was implementing the bf16->float conversion function using a left-shift of a signed integer, so for negative floating-point values a 1 was being shifted into the sign bit of the signed integer intermediate value. This is undefined behaviour, and was caught by UBSan. The vector versions are code-generated via Neon builtin functions, so probably don't have the same UB problem, but I've updated them anyway to be consistent. Fixes #61983. Full diff: https://github.com/llvm/llvm-project/pull/116985.diff 2 Files Affected:
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ec829f566ef5fc..ef89fa4358dfeb 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -252,7 +252,7 @@ def OP_BFMLALT_LN
def OP_VCVT_F32_BF16
: Op<(bitcast "R",
- (call "vshll_n", (bitcast "int16x4_t", $p0),
+ (call "vshll_n", (bitcast "uint16x4_t", $p0),
(literal "int32_t", "16")))>;
def OP_VCVT_F32_BF16_LO
: Op<(call "vcvt_f32_bf16", (call "vget_low", $p0))>;
@@ -275,8 +275,8 @@ def OP_VCVT_BF16_F32_HI_A32
(call "vget_low", $p0))>;
def OP_CVT_F32_BF16
- : Op<(bitcast "R", (op "<<", (cast "int32_t", (bitcast "int16_t", $p0)),
- (literal "int32_t", "16")))>;
+ : Op<(bitcast "R", (op "<<", (cast "uint32_t", (bitcast "uint16_t", $p0)),
+ (literal "uint32_t", "16")))>;
//===----------------------------------------------------------------------===//
// Auxiliary Instructions
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index e2be98c086853e..51aa5aa758f0c3 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -24,50 +24,50 @@
// CHECK-A64-LABEL: @test_vcvt_f32_bf16(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-A64-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_836_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
+// CHECK-A64-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A64-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-A64-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8
+// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 16
+// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 16
+// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 16
// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_836_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
+// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 8
+// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8
// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[__P0_836_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8
// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <4 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <4 x bfloat>, ptr [[A]], align 8
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[A1]], ptr [[COERCE]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP0]], ptr [[__P0_836_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8361_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8361_I]], ptr [[__REINT_836_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP0]], ptr [[__P0_808_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I]], ptr [[__REINT_808_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8
// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP4]]
//
float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
@@ -76,39 +76,39 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
+// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 16
+// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16
+// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16
// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
@@ -132,15 +132,15 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_836_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8361_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8361_I_I]], ptr [[__REINT_836_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]]
//
float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
@@ -149,39 +149,39 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
+// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 16
+// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16
+// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16
// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
@@ -205,15 +205,15 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_836_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8361_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8361_I_I]], ptr [[__REINT_836_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]]
//
float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
@@ -427,7 +427,7 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
// CHECK-NEXT: [[__REINT1_I:%.*]] = alloca i32, align 4
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32
// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
// CHECK-NEXT: store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__REINT1_I]], align 4
|
This was implementing the bf16->float conversion function using a left-shift of a signed integer, so for negative floating-point values a 1 was being shifted into the sign bit of the signed integer intermediate value. This is undefined behaviour, and was caught by UBSan.
The vector versions are code-generated via Neon builtin functions, so probably don't have the same UB problem, but I've updated them anyway to be consistent.
Fixes #61983.