From 340ddba33a7f02ed7735143a730db2d6064e734a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 9 Oct 2024 17:35:30 +0100 Subject: [PATCH 1/2] [AMDGPU] Disable -amdgpu-codegenprepare-widen-16-bit-ops by default Fixes #64591 --- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +- .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll | 70 ++- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 163 ++++--- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 149 +++++-- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 77 +--- .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 169 +++---- .../AMDGPU/GlobalISel/shl-ext-reduce.ll | 7 +- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 88 ++-- .../CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll | 54 ++- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 26 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 33 +- llvm/test/CodeGen/AMDGPU/anyext.ll | 8 +- llvm/test/CodeGen/AMDGPU/bitreverse.ll | 7 +- .../CodeGen/AMDGPU/calling-conventions.ll | 413 +++++++++--------- llvm/test/CodeGen/AMDGPU/ctlz.ll | 28 +- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 6 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 15 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 34 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 13 +- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 14 +- llvm/test/CodeGen/AMDGPU/min.ll | 260 ++++++----- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 19 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 6 +- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 290 ++++++------ llvm/test/CodeGen/AMDGPU/sra.ll | 79 ++-- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 34 +- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 2 +- 27 files changed, 977 insertions(+), 1089 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 052e1140533f3..d1654f8daea9d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -49,7 +49,7 @@ static cl::opt Widen16BitOps( "amdgpu-codegenprepare-widen-16-bit-ops", cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), cl::ReallyHidden, - cl::init(true)); + cl::init(false)); static cl::opt BreakLargePHIs("amdgpu-codegenprepare-break-large-phis", diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index c8b82716a9fe1..9c25a07bc8dc3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -281,12 +281,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 -; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_addk_i32 s1, 0xffc0 +; GFX8-NEXT: s_addk_i32 s0, 0xffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_splat: @@ -323,12 +323,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 4 +; GFX8-NEXT: s_addk_i32 s0, 0xffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_lo: @@ -365,12 +365,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_addk_i32 s1, 0xffc0 ; GFX8-NEXT: s_add_i32 s0, s0, 4 -; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_hi: @@ -408,14 +408,13 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX8-LABEL: s_add_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16: @@ -461,14 +460,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs: @@ -517,14 +515,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_rhs: @@ -580,14 +577,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 63f5464371cc6..58ae28bc48f4a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -79,22 +79,30 @@ define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) { ; ; GFX8-LABEL: s_ashr_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_sext_i32_i8 s1, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_ashr_i32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_ashr_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ashr_i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_sext_i32_i8 s1, s1 +; GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i8: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_sext_i32_i8 s1, s1 +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i8 %value, %amount @@ -102,15 +110,30 @@ define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) { } define amdgpu_ps i8 @s_ashr_i8_7(i8 inreg %value) { -; GCN-LABEL: s_ashr_i8_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_sext_i32_i8 s0, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 7 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_ashr_i8_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_sext_i32_i8 s0, s0 +; GFX6-NEXT: s_ashr_i32 s0, s0, 7 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_ashr_i8_7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 15 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_ashr_i8_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 7 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i8_7: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 7 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i8 %value, 7 @@ -652,21 +675,21 @@ define amdgpu_ps i16 @s_ashr_i16(i16 inreg %value, i16 inreg %amount) { ; GFX8-LABEL: s_ashr_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_ashr_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ashr_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_sext_i32_i16 s1, s1 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount @@ -827,14 +850,16 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX8-NEXT: s_ashr_i32 s2, s2, s3 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_ashr_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s1, s2 +; GFX8-NEXT: s_ashr_i32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1029,23 +1054,27 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX8-LABEL: s_ashr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s6, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s7, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 -; GFX8-NEXT: s_ashr_i32 s4, s4, s6 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_ashr_i32 s0, s0, s2 -; GFX8-NEXT: s_ashr_i32 s2, s5, s7 +; GFX8-NEXT: s_sext_i32_i16 s2, s4 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 +; GFX8-NEXT: s_ashr_i32 s2, s2, s6 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s3, s4, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s0, s0, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s5 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_ashr_i32 s3, s3, s7 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1236,41 +1265,49 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX8-LABEL: s_ashr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s9, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s12, s4 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s13, s5 -; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s10, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s14, s6 -; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010 +; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_ashr_i32 s4, s9, s13 +; GFX8-NEXT: s_sext_i32_i16 s4, s8 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 +; GFX8-NEXT: s_ashr_i32 s4, s4, s12 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_ashr_i32 s1, s1, s5 -; GFX8-NEXT: s_sext_i32_i16 s11, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s15, s7 -; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010 -; GFX8-NEXT: s_ashr_i32 s5, s10, s14 +; GFX8-NEXT: s_sext_i32_i16 s5, s9 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_ashr_i32 s5, s5, s13 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_ashr_i32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: s_ashr_i32 s8, s8, s12 -; GFX8-NEXT: s_ashr_i32 s6, s11, s15 +; GFX8-NEXT: s_sext_i32_i16 s6, s10 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_ashr_i32 s6, s6, s14 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_ashr_i32 s3, s3, s7 +; GFX8-NEXT: s_sext_i32_i16 s7, s11 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s4, s5, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s7, s8, 0xffff +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_ashr_i32 s7, s7, s15 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s4, s6, 0xffff -; GFX8-NEXT: s_or_b32 s0, s0, s7 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 5dd4fa0809131..6ebd8c6146095 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -69,15 +69,36 @@ define i8 @v_lshr_i8_7(i8 %value) { } define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) { -; GCN-LABEL: s_lshr_i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xff -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i8: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, %amount @@ -85,14 +106,30 @@ define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) { } define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) { -; GCN-LABEL: s_lshr_i8_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_u32 s0, s0, 0x10007 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i8_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10007 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i8_7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 7 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i8_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 7 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i8_7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x10007 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 7 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, 7 ret i8 %result @@ -619,15 +656,30 @@ define i16 @v_lshr_i16_15(i16 %value) { } define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) { -; GCN-LABEL: s_lshr_i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, %amount @@ -635,14 +687,27 @@ define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) { } define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) { -; GCN-LABEL: s_lshr_i16_15: -; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_u32 s0, s0, 0x1000f -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i16_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1000f +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i16_15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 15 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i16_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 15 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1000f +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 15 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, 15 ret i16 %result @@ -783,13 +848,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; GFX8-LABEL: s_lshr_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: s_lshr_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v2i16: @@ -970,21 +1035,21 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; GFX8-LABEL: s_lshr_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s4, s6 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_lshr_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v4i16: @@ -1155,37 +1220,37 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; GFX8-LABEL: s_lshr_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s4, s8, s12 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s5, s9, s13 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s10, s14 -; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s3, s7 ; GFX8-NEXT: s_lshr_b32 s7, s11, s15 -; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 +; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 42f1bf84c0420..306b5579bebbc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -7,37 +7,18 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { -; GFX7-LABEL: s_mul_i16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mul_i32 s0, s0, s1 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_mul_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_mul_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_mul_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den @@ -93,35 +74,27 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre ; ; GFX8-LABEL: s_mul_i16_zeroext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16_zeroext: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16_zeroext: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result @@ -170,42 +143,22 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { } define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) { -; GFX7-LABEL: s_mul_i16_signext: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mul_i32 s0, s0, s1 -; GFX7-NEXT: s_sext_i32_i16 s0, s0 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_mul_i16_signext: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_mul_i16_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_mul_i16_signext: +; GCN: ; %bb.0: +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16_signext: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16_signext: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_sext_i32_i16 s0, s0 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index bac80f0777c02..fc852aa416cd7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -40,30 +40,14 @@ define i8 @v_sext_inreg_i8_7(i8 %value) { } define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i8: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x50000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 3 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 3 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x50000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 3 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x50000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i8 %value, 3 %ashr = ashr i8 %shl, 3 @@ -71,30 +55,14 @@ define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) { } define amdgpu_ps i8 @s_sext_inreg_i8_6(i8 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i8_6: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i8_6: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 6 -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 6 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i8_6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 6 -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 6 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i8_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x20000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8_6: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 6 -; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 6 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x20000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i8 %value, 6 %ashr = ashr i8 %shl, 6 @@ -585,16 +553,12 @@ define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) { ; ; GFX9-LABEL: s_sext_inreg_i16_9: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 9 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 9 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x70000 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_9: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x70000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i16 %value, 9 %ashr = ashr i16 %shl, 9 @@ -616,16 +580,12 @@ define amdgpu_ps i16 @s_sext_inreg_i16_15(i16 inreg %value) { ; ; GFX9-LABEL: s_sext_inreg_i16_15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 15 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 15 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x10000 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15 -; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 15 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x10000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i16 %value, 15 %ashr = ashr i16 %shl, 15 @@ -720,15 +680,16 @@ define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) { ; GFX8-LABEL: s_sext_inreg_v2i16_11: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 11 ; GFX8-NEXT: s_lshl_b32 s1, s1, 11 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, 11 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_ashr_i32 s0, s0, 11 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, 11 +; GFX8-NEXT: s_ashr_i32 s0, s0, 11 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_v2i16_11: @@ -854,25 +815,27 @@ define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) { ; GFX8-LABEL: s_sext_inreg_v4i16_14: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshl_b32 s2, s2, 14 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, 14 -; GFX8-NEXT: s_lshl_b32 s2, s2, 14 -; GFX8-NEXT: s_lshl_b32 s1, s1, 14 -; GFX8-NEXT: s_lshl_b32 s3, s3, 14 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_ashr_i32 s0, s0, 14 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 14 -; GFX8-NEXT: s_ashr_i32 s1, s1, 14 +; GFX8-NEXT: s_lshl_b32 s3, s3, 14 +; GFX8-NEXT: s_ashr_i32 s0, s0, 14 +; GFX8-NEXT: s_lshl_b32 s1, s1, 14 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_ashr_i32 s3, s3, 14 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_ashr_i32 s1, s1, 14 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 -; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_v4i16_14: @@ -1068,45 +1031,49 @@ define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) { ; GFX8-LABEL: s_sext_inreg_v8i16_5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshl_b32 s4, s4, 5 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, 5 -; GFX8-NEXT: s_lshl_b32 s4, s4, 5 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_ashr_i32 s4, s4, 5 +; GFX8-NEXT: s_lshl_b32 s5, s5, 5 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_ashr_i32 s0, s0, 5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 5 -; GFX8-NEXT: s_lshl_b32 s5, s5, 5 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_ashr_i32 s5, s5, 5 +; GFX8-NEXT: s_lshl_b32 s6, s6, 5 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 +; GFX8-NEXT: s_ashr_i32 s1, s1, 5 ; GFX8-NEXT: s_lshl_b32 s2, s2, 5 -; GFX8-NEXT: s_lshl_b32 s6, s6, 5 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_ashr_i32 s0, s0, 5 -; GFX8-NEXT: s_ashr_i32 s4, s4, 5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 5 -; GFX8-NEXT: s_lshl_b32 s7, s7, 5 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_ashr_i32 s1, s1, 5 -; GFX8-NEXT: s_ashr_i32 s5, s5, 5 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_ashr_i32 s6, s6, 5 +; GFX8-NEXT: s_lshl_b32 s7, s7, 5 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_ashr_i32 s2, s2, 5 -; GFX8-NEXT: s_ashr_i32 s6, s6, 5 -; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_ashr_i32 s3, s3, 5 +; GFX8-NEXT: s_lshl_b32 s3, s3, 5 +; GFX8-NEXT: s_sext_i32_i16 s7, s7 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_ashr_i32 s7, s7, 5 -; GFX8-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_ashr_i32 s3, s3, 5 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_v8i16_5: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 183f2edbf9035..218d487aee413 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -637,13 +637,12 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { ; ; GFX8-LABEL: s_shl_v2i32_zext_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s2, 0x3fff -; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v2i32_zext_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 4cf1c92539c36..c7603b7cec04a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -71,19 +71,22 @@ define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) { ; ; GFX8-LABEL: s_shl_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, %amount @@ -627,19 +630,19 @@ define amdgpu_ps i16 @s_shl_i16(i16 inreg %value, i16 inreg %amount) { ; ; GFX8-LABEL: s_shl_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount @@ -791,13 +794,14 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun ; GFX8-LABEL: s_shl_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v2i16: @@ -976,21 +980,23 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; GFX8-LABEL: s_shl_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s2, s4, s6 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s7 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 -; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v4i16: @@ -1157,37 +1163,41 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; GFX8-LABEL: s_shl_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s4, s8, s12 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_lshl_b32 s5, s9, s13 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s6, s10, s14 -; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, s7 ; GFX8-NEXT: s_lshl_b32 s7, s11, s15 -; GFX8-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll index 855687281ce9a..49ba01aaf9e4f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll @@ -244,12 +244,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sub_i32 s0, s0, 0xffc0 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_sub_i32 s1, s1, 0xffffffc0 +; GFX8-NEXT: s_sub_i32 s0, s0, 0xffffffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_splat: @@ -284,12 +284,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sub_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_sub_i32 s1, s1, 4 +; GFX8-NEXT: s_sub_i32 s0, s0, 0xffffffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_lo: @@ -324,12 +324,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_sub_i32 s1, s1, 0xffffffc0 ; GFX8-NEXT: s_sub_i32 s0, s0, 4 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_hi: @@ -365,14 +365,13 @@ define amdgpu_ps i32 @s_sub_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX8-LABEL: s_sub_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16: @@ -412,14 +411,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_fneg_lhs: @@ -463,14 +461,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_fneg_rhs: @@ -516,14 +513,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 6bb4e2d3dbe26..7c9d8cba0fbb2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -35,15 +35,8 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; ; GFX8-LABEL: scalar_xnor_v2i16_one_use: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_xor_b32 s0, s0, s1 -; GFX8-NEXT: s_mov_b32 s3, s2 -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_xor_b32 s0, s0, -1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX900-LABEL: scalar_xnor_v2i16_one_use: @@ -129,21 +122,10 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX8-LABEL: scalar_xnor_v4i16_one_use: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_and_b32 s2, s0, 0xffff -; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_and_b32 s6, s1, 0xffff -; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5] -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX900-LABEL: scalar_xnor_v4i16_one_use: diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 4cc384e9d2718..bcd75255acef4 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -103,13 +103,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s3, s0, 16 +; VI-NEXT: s_add_i32 s1, s2, s0 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s2, 16 ; VI-NEXT: s_add_i32 s2, s2, s0 -; VI-NEXT: s_add_i32 s1, s1, s3 -; VI-NEXT: s_and_b32 s0, s2, 0xffff -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s0, s2, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -170,16 +170,15 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_add_i32 s0, s2, s2 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -230,12 +229,12 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 ; VI-NEXT: s_add_i32 s2, s2, s3 -; VI-NEXT: s_add_i32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s5, s4 ; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshl_b32 s3, s4, 16 +; VI-NEXT: s_lshl_b32 s3, s5, 16 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 8b6c8be9f3788..115cb40676da8 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -27,11 +27,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -42,11 +40,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 6f52da2631b8a..89735592cfa8a 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -50,8 +50,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s4, 0xffff -; GISEL-NEXT: s_brev_b32 s2, s2 +; GISEL-NEXT: s_brev_b32 s2, s4 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -80,11 +79,9 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-GISEL-NEXT: s_brev_b32 s2, s4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 29770738f83d5..e9ddc801b050c 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -980,7 +980,6 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { ; ; VI-LABEL: ps_mesa_inreg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_and_b32 s0, 0xffff, s0 ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_short v[0:1], v0 @@ -988,9 +987,8 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { ; ; GFX11-LABEL: ps_mesa_inreg_i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 @@ -1140,20 +1138,20 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s0, 16 -; VI-NEXT: s_lshr_b32 s1, s0, 24 -; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_bfe_u32 s3, s0, 0x80008 +; VI-NEXT: s_add_i32 s2, s2, s2 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_lshl_b32 s2, s2, 8 +; VI-NEXT: s_and_b32 s1, s1, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 24 -; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_add_i32 s3, s3, s3 ; VI-NEXT: s_or_b32 s1, s1, s2 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: s_lshl_b32 s2, s3, 8 ; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s1, s1, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1227,8 +1225,8 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: s_bfe_u32 s2, s0, 0x80008 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v2, s1 @@ -1308,22 +1306,21 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v0, 4 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: s_lshr_b32 s2, s0, 24 -; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 24 ; VI-NEXT: s_bfe_u32 s4, s0, 0x80008 +; VI-NEXT: s_add_i32 s3, s3, s3 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_add_i32 s4, s4, s4 +; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s2, s2, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: s_lshl_b32 s3, s4, 8 -; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_add_i32 s1, s1, s1 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: s_or_b32 s0, s0, s2 @@ -1423,37 +1420,37 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s1, 16 -; VI-NEXT: s_lshr_b32 s2, s1, 24 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 24 +; VI-NEXT: s_lshr_b32 s4, s1, 16 +; VI-NEXT: s_lshr_b32 s5, s1, 24 +; VI-NEXT: s_bfe_u32 s6, s0, 0x80008 +; VI-NEXT: s_bfe_u32 s7, s1, 0x80008 +; VI-NEXT: s_add_i32 s5, s5, s5 +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_bfe_u32 s6, s1, 0x80008 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_lshl_b32 s2, s2, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_lshr_b32 s5, s0, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s3, s6, 8 -; VI-NEXT: s_lshr_b32 s4, s0, 24 -; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_or_b32 s1, s1, s3 -; VI-NEXT: s_bfe_u32 s7, s0, 0x80008 -; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s3, s5, 0xff ; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: s_lshl_b32 s2, s4, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s5, s7, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s3, s7, 8 +; VI-NEXT: s_lshl_b32 s3, s6, 8 +; VI-NEXT: s_or_b32 s1, s1, s5 ; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s1, s1, s4 ; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1595,69 +1592,69 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_lshr_b32 s4, s3, 24 +; VI-NEXT: s_lshr_b32 s4, s0, 16 +; VI-NEXT: s_lshr_b32 s5, s0, 24 +; VI-NEXT: s_lshr_b32 s6, s1, 16 +; VI-NEXT: s_lshr_b32 s7, s1, 24 +; VI-NEXT: s_lshr_b32 s8, s2, 16 +; VI-NEXT: s_lshr_b32 s9, s2, 24 +; VI-NEXT: s_lshr_b32 s10, s3, 16 +; VI-NEXT: s_lshr_b32 s11, s3, 24 +; VI-NEXT: s_bfe_u32 s12, s0, 0x80008 +; VI-NEXT: s_bfe_u32 s13, s1, 0x80008 +; VI-NEXT: s_bfe_u32 s14, s2, 0x80008 +; VI-NEXT: s_bfe_u32 s15, s3, 0x80008 +; VI-NEXT: s_add_i32 s11, s11, s11 +; VI-NEXT: s_add_i32 s10, s10, s10 +; VI-NEXT: s_add_i32 s9, s9, s9 +; VI-NEXT: s_add_i32 s8, s8, s8 +; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_add_i32 s6, s6, s6 ; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_bfe_u32 s12, s3, 0x80008 ; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_lshl_b32 s4, s4, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s2, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_lshl_b32 s5, s12, 8 -; VI-NEXT: s_lshr_b32 s6, s2, 24 -; VI-NEXT: s_add_i32 s7, s7, s7 -; VI-NEXT: s_or_b32 s3, s3, s5 -; VI-NEXT: s_bfe_u32 s13, s2, 0x80008 -; VI-NEXT: s_add_i32 s6, s6, s6 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s5, s7, 0xff -; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xff ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_or_b32 s3, s3, s4 -; VI-NEXT: s_lshl_b32 s4, s6, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s9, s1, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s5, s13, 8 -; VI-NEXT: s_lshr_b32 s8, s1, 24 -; VI-NEXT: s_add_i32 s9, s9, s9 -; VI-NEXT: s_or_b32 s2, s2, s5 -; VI-NEXT: s_bfe_u32 s14, s1, 0x80008 -; VI-NEXT: s_add_i32 s8, s8, s8 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_and_b32 s5, s9, 0xff ; VI-NEXT: s_add_i32 s14, s14, s14 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_or_b32 s2, s2, s4 -; VI-NEXT: s_lshl_b32 s4, s8, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s11, s0, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s5, s14, 8 -; VI-NEXT: s_lshr_b32 s10, s0, 24 -; VI-NEXT: s_add_i32 s11, s11, s11 -; VI-NEXT: s_or_b32 s1, s1, s5 -; VI-NEXT: s_bfe_u32 s15, s0, 0x80008 -; VI-NEXT: s_add_i32 s10, s10, s10 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s5, s11, 0xff -; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s4 -; VI-NEXT: s_lshl_b32 s4, s10, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_lshl_b32 s11, s15, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s9, s14, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s7, s13, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s5, s15, 8 +; VI-NEXT: s_lshl_b32 s5, s12, 8 +; VI-NEXT: s_or_b32 s3, s3, s11 +; VI-NEXT: s_or_b32 s2, s2, s9 +; VI-NEXT: s_or_b32 s1, s1, s7 ; VI-NEXT: s_or_b32 s0, s0, s5 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s3, s3, s10 +; VI-NEXT: s_or_b32 s2, s2, s8 +; VI-NEXT: s_or_b32 s1, s1, s6 ; VI-NEXT: s_or_b32 s0, s0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1904,138 +1901,138 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v4, 16 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s9, s3, 16 -; VI-NEXT: s_lshr_b32 s8, s3, 24 +; VI-NEXT: s_lshr_b32 s8, s4, 16 +; VI-NEXT: s_lshr_b32 s9, s4, 24 +; VI-NEXT: s_lshr_b32 s10, s5, 16 +; VI-NEXT: s_lshr_b32 s11, s5, 24 +; VI-NEXT: s_lshr_b32 s12, s6, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 24 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s7, 24 +; VI-NEXT: s_bfe_u32 s24, s4, 0x80008 +; VI-NEXT: s_bfe_u32 s25, s5, 0x80008 +; VI-NEXT: s_bfe_u32 s26, s6, 0x80008 +; VI-NEXT: s_bfe_u32 s27, s7, 0x80008 +; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_add_i32 s14, s14, s14 +; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_add_i32 s11, s11, s11 +; VI-NEXT: s_add_i32 s10, s10, s10 ; VI-NEXT: s_add_i32 s9, s9, s9 -; VI-NEXT: s_bfe_u32 s24, s3, 0x80008 ; VI-NEXT: s_add_i32 s8, s8, s8 -; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshr_b32 s16, s0, 16 +; VI-NEXT: s_lshr_b32 s17, s0, 24 +; VI-NEXT: s_lshr_b32 s18, s1, 16 +; VI-NEXT: s_lshr_b32 s19, s1, 24 +; VI-NEXT: s_lshr_b32 s20, s2, 16 +; VI-NEXT: s_lshr_b32 s21, s2, 24 +; VI-NEXT: s_lshr_b32 s22, s3, 16 +; VI-NEXT: s_lshr_b32 s23, s3, 24 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_add_i32 s27, s27, s27 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_add_i32 s26, s26, s26 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_add_i32 s5, s5, s5 +; VI-NEXT: s_add_i32 s25, s25, s25 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_add_i32 s24, s24, s24 -; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_lshl_b32 s8, s8, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s11, s2, 16 +; VI-NEXT: s_bfe_u32 s28, s0, 0x80008 +; VI-NEXT: s_bfe_u32 s29, s1, 0x80008 +; VI-NEXT: s_bfe_u32 s30, s2, 0x80008 +; VI-NEXT: s_bfe_u32 s31, s3, 0x80008 +; VI-NEXT: s_add_i32 s23, s23, s23 +; VI-NEXT: s_add_i32 s22, s22, s22 +; VI-NEXT: s_add_i32 s21, s21, s21 +; VI-NEXT: s_add_i32 s20, s20, s20 +; VI-NEXT: s_add_i32 s19, s19, s19 +; VI-NEXT: s_add_i32 s18, s18, s18 +; VI-NEXT: s_add_i32 s17, s17, s17 +; VI-NEXT: s_add_i32 s16, s16, s16 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s15, s27, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s11, s25, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s9, s24, 8 -; VI-NEXT: s_lshr_b32 s10, s2, 24 -; VI-NEXT: s_add_i32 s11, s11, s11 -; VI-NEXT: s_or_b32 s3, s3, s9 -; VI-NEXT: s_bfe_u32 s25, s2, 0x80008 -; VI-NEXT: s_add_i32 s10, s10, s10 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s9, s11, 0xff -; VI-NEXT: s_add_i32 s25, s25, s25 +; VI-NEXT: s_lshl_b32 s23, s23, 8 +; VI-NEXT: s_and_b32 s22, s22, 0xff +; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_add_i32 s31, s31, s31 +; VI-NEXT: s_lshl_b32 s21, s21, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xff ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_or_b32 s3, s3, s8 -; VI-NEXT: s_lshl_b32 s8, s10, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s13, s1, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_lshr_b32 s12, s1, 24 -; VI-NEXT: s_add_i32 s13, s13, s13 -; VI-NEXT: s_or_b32 s2, s2, s9 -; VI-NEXT: s_bfe_u32 s26, s1, 0x80008 -; VI-NEXT: s_add_i32 s12, s12, s12 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_and_b32 s9, s13, 0xff -; VI-NEXT: s_add_i32 s26, s26, s26 +; VI-NEXT: s_add_i32 s30, s30, s30 +; VI-NEXT: s_lshl_b32 s19, s19, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_or_b32 s2, s2, s8 -; VI-NEXT: s_lshl_b32 s8, s12, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s15, s0, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s9, s26, 8 -; VI-NEXT: s_lshr_b32 s14, s0, 24 -; VI-NEXT: s_add_i32 s15, s15, s15 -; VI-NEXT: s_or_b32 s1, s1, s9 -; VI-NEXT: s_bfe_u32 s27, s0, 0x80008 -; VI-NEXT: s_add_i32 s14, s14, s14 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s9, s15, 0xff -; VI-NEXT: s_add_i32 s27, s27, s27 +; VI-NEXT: s_add_i32 s29, s29, s29 +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s8 -; VI-NEXT: s_lshl_b32 s8, s14, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s17, s7, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s9, s27, 8 -; VI-NEXT: s_lshr_b32 s16, s7, 24 -; VI-NEXT: s_add_i32 s17, s17, s17 -; VI-NEXT: s_or_b32 s0, s0, s9 -; VI-NEXT: s_bfe_u32 s28, s7, 0x80008 -; VI-NEXT: s_add_i32 s16, s16, s16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_and_b32 s9, s17, 0xff ; VI-NEXT: s_add_i32 s28, s28, s28 -; VI-NEXT: s_add_i32 s7, s7, s7 -; VI-NEXT: s_or_b32 s0, s0, s8 -; VI-NEXT: s_lshl_b32 s8, s16, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s19, s6, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s9, s28, 8 -; VI-NEXT: s_lshr_b32 s18, s6, 24 -; VI-NEXT: s_add_i32 s19, s19, s19 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_bfe_u32 s29, s6, 0x80008 -; VI-NEXT: s_add_i32 s18, s18, s18 +; VI-NEXT: s_or_b32 s7, s7, s15 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s5, s5, s11 +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_lshl_b32 s23, s31, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s21, s30, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s19, s29, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s17, s28, 8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s9, s19, 0xff -; VI-NEXT: s_add_i32 s29, s29, s29 -; VI-NEXT: s_add_i32 s6, s6, s6 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_lshl_b32 s8, s18, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s21, s5, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s9, s29, 8 -; VI-NEXT: s_lshr_b32 s20, s5, 24 -; VI-NEXT: s_add_i32 s21, s21, s21 -; VI-NEXT: s_or_b32 s6, s6, s9 -; VI-NEXT: s_bfe_u32 s30, s5, 0x80008 -; VI-NEXT: s_add_i32 s20, s20, s20 +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s9, s21, 0xff -; VI-NEXT: s_add_i32 s30, s30, s30 -; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_lshl_b32 s8, s20, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s23, s4, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_lshl_b32 s9, s30, 8 -; VI-NEXT: s_lshr_b32 s22, s4, 24 -; VI-NEXT: s_add_i32 s23, s23, s23 -; VI-NEXT: s_or_b32 s5, s5, s9 -; VI-NEXT: s_bfe_u32 s31, s4, 0x80008 -; VI-NEXT: s_add_i32 s22, s22, s22 +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_and_b32 s9, s23, 0xff -; VI-NEXT: s_add_i32 s31, s31, s31 -; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s22, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s9, s31, 8 -; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s3, s3, s23 +; VI-NEXT: s_or_b32 s2, s2, s21 +; VI-NEXT: s_or_b32 s1, s1, s19 +; VI-NEXT: s_or_b32 s0, s0, s17 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: s_or_b32 s6, s6, s12 +; VI-NEXT: s_or_b32 s5, s5, s10 ; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_lshl_b32 s22, s22, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s20, s20, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_or_b32 s3, s3, s22 +; VI-NEXT: s_or_b32 s2, s2, s20 +; VI-NEXT: s_or_b32 s1, s1, s18 +; VI-NEXT: s_or_b32 s0, s0, s16 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 93e14a205f05d..389d00c7e95ff 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1650,15 +1650,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_min_u32_e32 v2, 32, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1696,11 +1691,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -1711,11 +1702,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_sub_nc_u16 v2, v2, 16 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -1727,13 +1717,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 4588bee49f037..1d3b308f346fc 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -657,8 +657,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm @@ -767,8 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index ccd23a91c3573..2ae4bf0a6ceec 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -1402,15 +1402,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1448,10 +1443,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX10-NEXT: v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -1463,9 +1455,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 4c7c8bc1c027d..966619a090d28 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -629,8 +629,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm @@ -731,8 +730,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm @@ -1460,13 +1458,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0xff ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x100, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -1503,14 +1496,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm @@ -1558,19 +1550,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_readfirstlane_b32 s2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s3, v0 -; VI-NEXT: s_lshl_b32 s2, s2, 8 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s3, s2, 0x10000 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_ff1_i32_b32 s3, s3 -; VI-NEXT: s_cmp_lg_u32 s2, 0 -; VI-NEXT: s_cselect_b32 s2, s3, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1613,8 +1598,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index e447429539e6f..9c3f5f1cd672d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -635,12 +635,7 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 -; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_xor_b32 s2, s4, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -721,11 +716,9 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_add_f16_e64 v1, s3, 2.0 -; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_sub_f16_e64 v1, 2.0, s4 +; VI-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index a4bde5c9d8215..c06a3dab32982 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -20,13 +20,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s2, 0xffff -; VI-NEXT: s_lshr_b32 s2, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_lshr_b32 s2, s2, s5 -; VI-NEXT: s_lshr_b32 s3, s4, s3 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshr_b32 s4, s5, s4 +; VI-NEXT: s_lshr_b32 s2, s2, s3 +; VI-NEXT: s_lshl_b32 s3, s4, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 05ef2698c1f77..7754a23ed8007 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -438,12 +438,12 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i8 s2, s2 -; VI-NEXT: s_sext_i32_i8 s3, s3 -; VI-NEXT: s_min_i32 s2, s2, s3 +; VI-NEXT: s_bfe_i32 s2, s2, 0x80000 +; VI-NEXT: s_bfe_i32 s3, s3, 0x80000 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_min_i16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -454,10 +454,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NEXT: s_min_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_min_i16_e32 v1, s2, v1 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -469,10 +469,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i8 s2, s2 -; GFX10-NEXT: s_sext_i32_i8 s3, s3 -; GFX10-NEXT: s_min_i32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX10-NEXT: v_min_i16 v1, s2, s3 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -484,11 +483,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i8 s2, s4 -; GFX11-NEXT: s_sext_i32_i8 s3, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s2, s3 -; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_bfe_i32 s2, s4, 0x80000 +; GFX11-NEXT: s_bfe_i32 s3, s5, 0x80000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_min_i16 v1, s2, s3 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -593,30 +591,33 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s4, s2, 24 -; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 -; VI-NEXT: s_bfe_i32 s6, s2, 0x80008 -; VI-NEXT: s_sext_i32_i8 s2, s2 -; VI-NEXT: s_ashr_i32 s7, s3, 24 -; VI-NEXT: s_bfe_i32 s8, s3, 0x80010 -; VI-NEXT: s_bfe_i32 s9, s3, 0x80008 -; VI-NEXT: s_sext_i32_i8 s3, s3 -; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: s_min_i32 s3, s6, s9 -; VI-NEXT: s_min_i32 s5, s5, s8 -; VI-NEXT: s_min_i32 s4, s4, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_lshl_b32 s3, s3, 8 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s4, s4, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s2, s2, s4 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_ashr_i32 s6, s2, 24 +; VI-NEXT: s_lshr_b32 s8, s3, 16 +; VI-NEXT: s_ashr_i32 s9, s3, 24 +; VI-NEXT: s_bfe_i32 s8, s8, 0x80000 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: s_bfe_i32 s5, s5, 0x80000 +; VI-NEXT: s_sext_i32_i16 s7, s3 +; VI-NEXT: v_min_i16_e32 v0, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: s_sext_i32_i16 s4, s2 +; VI-NEXT: s_lshr_b32 s7, s7, 8 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: v_min_i16_e32 v1, s5, v1 +; VI-NEXT: s_lshr_b32 s4, s4, 8 +; VI-NEXT: s_bfe_i32 s3, s3, 0x80000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_bfe_i32 s2, s2, 0x80000 +; VI-NEXT: v_min_i16_e32 v1, s4, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_min_i16_e32 v2, s2, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -789,18 +790,16 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s4, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_ashr_i32 s5, s3, 16 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_min_i32 s4, s4, s5 -; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: s_lshl_b32 s3, s4, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_min_i16_e32 v0, s2, v0 +; VI-NEXT: v_min_i16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -953,27 +952,23 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s6, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_ashr_i32 s8, s3, 16 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_ashr_i32 s7, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s9, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_min_i32 s6, s6, s8 -; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_min_i32 s7, s7, s9 -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: s_lshl_b32 s2, s6, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: s_lshl_b32 s2, s7, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshr_b32 s6, s3, 16 +; VI-NEXT: s_lshr_b32 s7, s1, 16 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_min_i16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_min_i16_e32 v0, s1, v0 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 16 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_min_i16_e32 v2, s0, v2 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2661,19 +2656,22 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: flat_load_ushort v5, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_ushort v4, v[2:3] +; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_and_b32_e32 v6, 0xffff, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; VI-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: flat_store_byte v[2:3], v0 @@ -2687,7 +2685,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX9-NEXT: global_load_ushort v1, v0, s[12:13] ; GFX9-NEXT: global_load_ushort v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: global_store_short v0, v1, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -2703,7 +2701,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX10-NEXT: global_load_ushort v1, v0, s[12:13] ; GFX10-NEXT: global_load_ushort v2, v0, s[14:15] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_lt_u32_sdwa vcc_lo, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: global_store_short v0, v1, s[8:9] @@ -2716,11 +2714,15 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_u16 v1, v0, s[4:5] -; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -3174,43 +3176,39 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s11, 16 -; VI-NEXT: s_lshr_b32 s4, s10, 16 -; VI-NEXT: s_and_b32 s5, s10, 0xffff -; VI-NEXT: s_lshr_b32 s10, s15, 16 -; VI-NEXT: s_and_b32 s3, s11, 0xffff -; VI-NEXT: s_and_b32 s11, s15, 0xffff -; VI-NEXT: s_lshr_b32 s15, s14, 16 -; VI-NEXT: s_min_u32 s2, s2, s10 -; VI-NEXT: s_lshr_b32 s6, s9, 16 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_lshr_b32 s9, s8, 16 -; VI-NEXT: s_and_b32 s14, s14, 0xffff -; VI-NEXT: s_lshr_b32 s16, s13, 16 -; VI-NEXT: s_lshr_b32 s17, s12, 16 -; VI-NEXT: s_min_u32 s4, s4, s15 -; VI-NEXT: s_min_u32 s3, s3, s11 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s12, s12, 0xffff -; VI-NEXT: s_min_u32 s9, s9, s17 -; VI-NEXT: s_min_u32 s6, s6, s16 -; VI-NEXT: s_min_u32 s5, s5, s14 -; VI-NEXT: s_or_b32 s2, s3, s2 -; VI-NEXT: s_lshl_b32 s3, s4, 16 -; VI-NEXT: s_min_u32 s8, s8, s12 -; VI-NEXT: s_min_u32 s7, s7, s13 -; VI-NEXT: s_or_b32 s3, s5, s3 -; VI-NEXT: s_lshl_b32 s4, s6, 16 -; VI-NEXT: s_lshl_b32 s5, s9, 16 -; VI-NEXT: s_or_b32 s4, s7, s4 -; VI-NEXT: s_or_b32 s5, s8, s5 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s2, s15, 16 +; VI-NEXT: s_lshr_b32 s3, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_min_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_min_u16_e32 v0, s11, v0 +; VI-NEXT: s_lshr_b32 s2, s14, 16 +; VI-NEXT: s_lshr_b32 s3, s10, 16 +; VI-NEXT: v_or_b32_e32 v3, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_min_u16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s14 +; VI-NEXT: v_min_u16_e32 v1, s10, v1 +; VI-NEXT: s_lshr_b32 s2, s13, 16 +; VI-NEXT: s_lshr_b32 s3, s9, 16 +; VI-NEXT: v_or_b32_e32 v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_min_u16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_min_u16_e32 v1, s9, v1 +; VI-NEXT: s_lshr_b32 s2, s12, 16 +; VI-NEXT: s_lshr_b32 s3, s8, 16 +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_min_u16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_min_u16_e32 v4, s8, v4 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -3536,12 +3534,11 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s3, s2 -; VI-NEXT: s_ashr_i32 s2, s2, 16 -; VI-NEXT: s_min_i32 s2, s3, s2 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_min_i16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -3551,10 +3548,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_min_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_min_i16_e32 v1, s2, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -3565,10 +3561,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s3, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_min_i32 s2, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s3, s2, 16 +; GFX10-NEXT: v_min_i16 v1, s2, s3 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -3579,11 +3573,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_ashr_i32 s3, s4, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s2, s3 -; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_min_i16 v1, s4, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 5a1cc72644d47..b1066e0f8f26a 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1787,15 +1787,14 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; NOSDWA-NEXT: flat_load_dword v1, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: s_waitcnt vmcnt(1) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, v1, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v4 -; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; NOSDWA-NEXT: v_or_b32_e32 v2, v1, v2 +; NOSDWA-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2 ; NOSDWA-NEXT: s_endpgm @@ -1813,9 +1812,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX89-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX89-NEXT: v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_add_u32_e32 v3, vcc, v1, v2 +; GFX89-NEXT: v_add_u32_sdwa v1, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX89-NEXT: v_or_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_mov_b32_e32 v1, s5 ; GFX89-NEXT: flat_store_dword v[0:1], v2 ; GFX89-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 9b9f03ff74aa3..44dd0b6e27e74 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -27,9 +27,9 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_lshl_b32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_lshl_b32 s0, s1, s0 ; VI-NEXT: s_lshl_b32 s1, s2, s3 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 3446e0384cc54..3c7569c4601e8 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -22,23 +22,19 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_sub_i32 s3, 0, s4 -; VI-NEXT: s_ashr_i32 s5, s4, 16 -; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sub_i32 s2, 0, s2 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_max_i32 s3, s4, s3 -; VI-NEXT: s_max_i32 s2, s5, s2 -; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_add_i32 s2, s2, 0x20000 +; VI-NEXT: s_sub_i32 s2, 0, s4 +; VI-NEXT: s_lshr_b32 s3, s4, 16 +; VI-NEXT: s_sub_i32 s5, 0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_max_i16_e32 v1, s4, v1 +; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -171,23 +167,19 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_sub_i32 s3, 0, s4 -; VI-NEXT: s_ashr_i32 s5, s4, 16 -; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sub_i32 s2, 0, s2 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_max_i32 s3, s4, s3 -; VI-NEXT: s_max_i32 s2, s5, s2 -; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_add_i32 s2, s2, 0x20000 +; VI-NEXT: s_sub_i32 s2, 0, s4 +; VI-NEXT: s_lshr_b32 s3, s4, 16 +; VI-NEXT: s_sub_i32 s5, 0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_max_i16_e32 v1, s4, v1 +; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -331,37 +323,29 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_sub_i32 s6, 0, s3 -; VI-NEXT: s_sub_i32 s7, 0, s2 -; VI-NEXT: s_sub_i32 s5, 0, s5 -; VI-NEXT: s_sub_i32 s4, 0, s4 -; VI-NEXT: s_ashr_i32 s8, s2, 16 -; VI-NEXT: s_ashr_i32 s9, s3, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_sext_i32_i16 s7, s7 -; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sext_i32_i16 s5, s5 -; VI-NEXT: s_max_i32 s3, s3, s6 -; VI-NEXT: s_max_i32 s2, s2, s7 -; VI-NEXT: s_max_i32 s5, s9, s5 -; VI-NEXT: s_max_i32 s4, s8, s4 -; VI-NEXT: s_add_i32 s2, s2, 2 -; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s3, s5, s3 -; VI-NEXT: s_or_b32 s2, s4, s2 -; VI-NEXT: s_add_i32 s3, s3, 0x20000 -; VI-NEXT: s_add_i32 s2, s2, 0x20000 +; VI-NEXT: s_lshr_b32 s7, s2, 16 +; VI-NEXT: s_lshr_b32 s6, s3, 16 +; VI-NEXT: s_sub_i32 s9, 0, s7 +; VI-NEXT: s_sub_i32 s8, 0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_sub_i32 s4, 0, s3 +; VI-NEXT: s_sub_i32 s5, 0, s2 +; VI-NEXT: v_max_i16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_max_i16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_max_i16_e32 v2, s2, v2 +; VI-NEXT: v_max_i16_e32 v3, s3, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x20000, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x20000, v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -559,28 +543,23 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_ashr_i32 s2, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s3, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_lshr_b32 s2, s1, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 16 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_max_i16_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_i16_e32 v8, s0, v7 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_max_i32 s4, s2, s3 -; VI-NEXT: s_max_i32 s5, s0, s1 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: s_min_i32 s0, s0, s1 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_lshl_b32 s1, s2, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 +; VI-NEXT: v_min_i16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_min_i16_e32 v5, s0, v7 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: flat_store_dword v[0:1], v6 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[2:3], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; @@ -661,12 +640,12 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_max_i16_e32 v6, v4, v5 +; VI-NEXT: v_max_i16_sdwa v7, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_i16_e32 v8, v4, v5 +; VI-NEXT: v_min_i16_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v5, v6, v7 +; VI-NEXT: v_or_b32_e32 v4, v8, v4 ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[2:3], v4 @@ -748,40 +727,30 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s0, s7, 16 +; VI-NEXT: s_lshr_b32 s1, s5, 16 +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v8, s7 +; VI-NEXT: s_lshr_b32 s0, s6, 16 +; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: v_max_i16_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_i16_e32 v5, s5, v8 +; VI-NEXT: v_mov_b32_e32 v9, s0 +; VI-NEXT: v_mov_b32_e32 v10, s1 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_or_b32_e32 v5, v5, v4 +; VI-NEXT: v_max_i16_sdwa v4, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_i16_e32 v12, s4, v11 +; VI-NEXT: v_min_i16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_min_i16_e32 v7, s5, v8 +; VI-NEXT: v_or_b32_e32 v4, v12, v4 +; VI-NEXT: v_or_b32_e32 v7, v7, v6 +; VI-NEXT: v_min_i16_sdwa v6, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_min_i16_e32 v8, s4, v11 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_ashr_i32 s0, s5, 16 -; VI-NEXT: s_ashr_i32 s1, s4, 16 -; VI-NEXT: s_sext_i32_i16 s2, s5 -; VI-NEXT: s_sext_i32_i16 s3, s4 -; VI-NEXT: s_ashr_i32 s4, s7, 16 -; VI-NEXT: s_ashr_i32 s5, s6, 16 -; VI-NEXT: s_sext_i32_i16 s7, s7 -; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_max_i32 s8, s1, s5 -; VI-NEXT: s_max_i32 s9, s0, s4 -; VI-NEXT: s_max_i32 s10, s3, s6 -; VI-NEXT: s_max_i32 s11, s2, s7 -; VI-NEXT: s_min_i32 s0, s0, s4 -; VI-NEXT: s_min_i32 s2, s2, s7 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_min_i32 s1, s1, s5 -; VI-NEXT: s_min_i32 s3, s3, s6 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s9, s11, s9 -; VI-NEXT: s_or_b32 s8, s10, s8 -; VI-NEXT: s_or_b32 s0, s2, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s2, s3, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: s_or_b32 s1, s2, s1 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: v_mov_b32_e32 v7, s0 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[6:7] @@ -899,42 +868,34 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_readfirstlane_b32 s0, v4 -; VI-NEXT: v_readfirstlane_b32 s1, v5 -; VI-NEXT: s_ashr_i32 s3, s0, 16 -; VI-NEXT: s_ashr_i32 s5, s1, 16 -; VI-NEXT: s_cmp_gt_i32 s3, s5 -; VI-NEXT: s_sext_i32_i16 s2, s0 -; VI-NEXT: s_sext_i32_i16 s4, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b32 s0, s3, s5 -; VI-NEXT: s_cselect_b32 s3, s5, s3 -; VI-NEXT: s_lshl_b32 s5, s0, 16 -; VI-NEXT: s_cmp_gt_i32 s2, s4 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b32 s0, s2, s4 -; VI-NEXT: s_cselect_b32 s1, s4, s2 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v4 -; VI-NEXT: s_lshl_b32 s2, s3, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s5 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: v_and_b32_e32 v4, 3, v4 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_ashrrev_i32_e32 v10, 16, v4 +; VI-NEXT: v_ashrrev_i32_e32 v11, 16, v5 +; VI-NEXT: v_bfe_i32 v6, v4, 0, 16 +; VI-NEXT: v_bfe_i32 v7, v5, 0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, v10, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], v6, v7 +; VI-NEXT: v_cndmask_b32_e64 v6, v5, v4, s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; VI-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 1, v5 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: flat_store_dword v[0:1], v6 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[2:3], v6 +; VI-NEXT: v_or_b32_e32 v0, v9, v5 +; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 3, v0 +; VI-NEXT: flat_store_dword v[2:3], v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_byte v[0:1], v4 +; VI-NEXT: flat_store_byte v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; @@ -1020,27 +981,24 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_lshr_b32 s2, s0, 16 -; VI-NEXT: s_lshr_b32 s3, s1, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_max_u32 s5, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_max_u32 s4, s0, s1 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_min_u32 s0, s0, s1 -; VI-NEXT: s_min_u32 s1, s2, s3 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s2, s1, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 16 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_max_u16_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_u16_e32 v8, s0, v7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 +; VI-NEXT: v_min_u16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_min_u16_e32 v5, s0, v7 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: flat_store_dword v[0:1], v6 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[2:3], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index cd06a060a50cd..1540c3e5c403f 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -187,15 +187,14 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s0, v1 -; VI-NEXT: v_readfirstlane_b32 s1, v0 -; VI-NEXT: s_ashr_i32 s2, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: v_readfirstlane_b32 s1, v1 +; VI-NEXT: s_lshr_b32 s2, s1, 16 ; VI-NEXT: s_ashr_i32 s3, s0, 16 ; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s0, s1, s0 -; VI-NEXT: s_ashr_i32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_ashr_i32 s2, s3, s2 +; VI-NEXT: s_ashr_i32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -282,43 +281,41 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s0, v2 -; VI-NEXT: v_readfirstlane_b32 s1, v3 -; VI-NEXT: v_readfirstlane_b32 s2, v0 -; VI-NEXT: v_readfirstlane_b32 s3, v1 -; VI-NEXT: s_ashr_i32 s8, s3, 16 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_ashr_i32 s9, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_ashr_i32 s10, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_ashr_i32 s11, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s0, s2, s0 -; VI-NEXT: s_ashr_i32 s2, s9, s11 -; VI-NEXT: s_ashr_i32 s1, s3, s1 -; VI-NEXT: s_ashr_i32 s3, s8, s10 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s1, s1, s3 -; VI-NEXT: s_or_b32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_readfirstlane_b32 s4, v0 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_lshr_b32 s8, s7, 16 +; VI-NEXT: s_ashr_i32 s9, s6, 16 +; VI-NEXT: s_sext_i32_i16 s6, s6 +; VI-NEXT: s_lshr_b32 s10, s5, 16 +; VI-NEXT: s_ashr_i32 s11, s4, 16 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_ashr_i32 s8, s9, s8 +; VI-NEXT: s_ashr_i32 s6, s6, s7 +; VI-NEXT: s_ashr_i32 s7, s11, s10 +; VI-NEXT: s_ashr_i32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 5a821db6ff040..327a85e80da9d 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -117,23 +117,21 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s6, s[6:7], 0x0 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dword s2, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s6, 16 -; VI-NEXT: s_lshr_b32 s5, s7, 16 -; VI-NEXT: s_sub_i32 s6, s6, s7 -; VI-NEXT: s_sub_i32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s6, 0xffff -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_sub_i32 s1, s2, s0 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: s_sub_i32 s0, s2, s0 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16: @@ -235,9 +233,9 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_sub_i32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_sub_i32 s0, s1, s0 ; VI-NEXT: s_sub_i32 s1, s2, s3 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 7a1f05f56a751..f074f7bf67f77 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -323,7 +323,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s0, 0xffffff00 +; VI-NEXT: s_and_b32 s1, s0, 0xff00 ; VI-NEXT: s_add_i32 s0, s0, 12 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: s_and_b32 s0, s0, 0xff From b363ddea9520ceef803097a7ab58a94f883f8b4a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 9 Oct 2024 17:49:58 +0100 Subject: [PATCH 2/2] clang-format --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index d1654f8daea9d..f689fcf62fe8e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -46,10 +46,10 @@ static cl::opt WidenLoads( cl::init(false)); static cl::opt Widen16BitOps( - "amdgpu-codegenprepare-widen-16-bit-ops", - cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), - cl::ReallyHidden, - cl::init(false)); + "amdgpu-codegenprepare-widen-16-bit-ops", + cl::desc( + "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, cl::init(false)); static cl::opt BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",