From dd6b82da1ff12c36e5c5d1c61bada24b38033c44 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Sat, 15 Feb 2025 18:05:34 -0500
Subject: [PATCH] [NFC][AMDGPU] Auto generate check lines for three test cases

- `CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll`
- `CodeGen/AMDGPU/call-preserved-registers.ll`
- `CodeGen/AMDGPU/stack-realign.ll`

This is in preparation for another PR.
---
 .../AMDGPU/call-preserved-registers.ll        | 823 +++++++++++++-----
 .../spill_more_than_wavesize_csr_sgprs.ll     | 319 ++++++-
 llvm/test/CodeGen/AMDGPU/stack-realign.ll     | 813 +++++++++++++----
 3 files changed, 1569 insertions(+), 386 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index ff80e05197b0d..db9ce56ecc3cc 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
@@ -5,110 +6,258 @@
 
 declare hidden void @external_void_func_void() #3
 
-; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
-; GCN: s_getpc_b64 s[34:35]
-; GCN-NEXT: s_add_u32 s34, s34,
-; GCN-NEXT: s_addc_u32 s35, s35,
-; GCN: s_swappc_b64 s[30:31], s[34:35]
-
-; GCN-NEXT: #ASMSTART
-; GCN-NEXT: #ASMEND
-; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35]
 define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
+; FLATSCR-LABEL: test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[34:35]
+; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: s_endpgm
   call void @external_void_func_void()
   call void asm sideeffect "", ""() #0
   call void @external_void_func_void()
   ret void
 }
 
-; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; MUBUF: buffer_store_dword
-; FLATSCR: scratch_store_dword
-; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4
-; GCN: v_writelane_b32 v40, s30, 0
-; GCN: v_writelane_b32 v40, s31, 1
-; GCN: v_writelane_b32 v40, s34, 2
-; GCN: v_writelane_b32 v40, s35, 3
-
-; GCN: s_swappc_b64
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_swappc_b64
-; GCN: v_readlane_b32 s35, v40, 3
-; GCN: v_readlane_b32 s34, v40, 2
-; MUBUF-DAG: v_readlane_b32 s31, v40, 1
-; MUBUF-DAG: v_readlane_b32 s30, v40, 0
-; FLATSCR-DAG: v_readlane_b32 s31, v40, 1
-; FLATSCR-DAG: v_readlane_b32 s30, v40, 0
-
-; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
-; MUBUF: buffer_load_dword
-; FLATSCR: scratch_load_dword
-; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN: s_setpc_b64 s[30:31]
 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
+; MUBUF-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT: s_getpc_b64 s[34:35]
+; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; MUBUF-NEXT: v_readlane_b32 s35, v40, 3
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; MUBUF-NEXT: v_readlane_b32 s4, v40, 4
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_mov_b32 s33, s4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_mov_b32 s0, s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT: s_getpc_b64 s[34:35]
+; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s33, s0
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
   call void @external_void_func_void()
   call void asm sideeffect "", ""() #0
   call void @external_void_func_void()
   ret void
 }
 
-; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN: s_mov_b32 s33, s32
-; MUBUF: buffer_store_dword v40
-; FLATSCR: scratch_store_dword off, v40
-; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4
-; MUBUF: s_addk_i32 s32, 0x400
-; FLATSCR: s_add_i32 s32, s32, 16
-
-; GCN: s_swappc_b64
-; GCN-NEXT: s_swappc_b64
-
-; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
-; MUBUF: buffer_load_dword v40
-; FLATSCR: scratch_load_dword v40
-; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 define void @test_func_call_external_void_funcx2() #0 {
+; MUBUF-LABEL: test_func_call_external_void_funcx2:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT: s_getpc_b64 s[34:35]
+; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; MUBUF-NEXT: v_readlane_b32 s35, v40, 3
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; MUBUF-NEXT: v_readlane_b32 s4, v40, 4
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_mov_b32 s33, s4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: test_func_call_external_void_funcx2:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_mov_b32 s0, s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT: s_getpc_b64 s[34:35]
+; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s33, s0
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
   call void @external_void_func_void()
   call void @external_void_func_void()
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31:
-; GCN: s_waitcnt
-; GCN: v_writelane_b32 v0, s30, 0
-; GCN: v_writelane_b32 v0, s31, 1
-; GCN-NEXT: #ASMSTART
-; GCN: ; clobber
-; GCN-NEXT: #ASMEND
-; GCN: v_readlane_b32 s31, v0, 1
-; GCN: v_readlane_b32 s30, v0, 0
-; GCN: s_setpc_b64 s[30:31]
 define void @void_func_void_clobber_s30_s31() #2 {
+; MUBUF-LABEL: void_func_void_clobber_s30_s31:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: v_writelane_b32 v0, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v0, s31, 1
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; clobber
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: v_readlane_b32 s31, v0, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v0, 0
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: void_func_void_clobber_s30_s31:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: v_writelane_b32 v0, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v0, s31, 1
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; clobber
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v0, 0
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_void_clobber_vcc:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_setpc_b64 s[30:31]
 define hidden void @void_func_void_clobber_vcc() #2 {
+; GCN-LABEL: void_func_void_clobber_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_setpc_b64 s[30:31]
   call void asm sideeffect "", "~{vcc}"() #0
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc:
-; GCN: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN: s_mov_b64 s[34:35], vcc
-; GCN-NEXT: s_swappc_b64
-; GCN: s_mov_b64 vcc, s[34:35]
 define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_clobber_vcc:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_add_u32 s8, s4, 8
+; FLATSCR-NEXT: s_addc_u32 s9, s5, 0
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FLATSCR-NEXT: s_mov_b32 s14, s12
+; FLATSCR-NEXT: s_mov_b32 s13, s11
+; FLATSCR-NEXT: s_mov_b32 s12, s10
+; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
+; FLATSCR-NEXT: s_getpc_b64 s[16:17]
+; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_vcc@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_vcc@rel32@hi+12
+; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
+; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
+; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def vcc
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_mov_b64 s[34:35], vcc
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_mov_b64 vcc, s[34:35]
+; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1
+; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use vcc
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
   %vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
   call void @void_func_void_clobber_vcc()
   %val0 = load volatile i32, ptr addrspace(1) undef
@@ -117,22 +266,50 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
-; GCN: s_mov_b32 s33, s31
-; GCN: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s31, s33
 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_mayclobber_s31:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s31
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_mov_b32 s33, s31
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: s_mov_b32 s31, s33
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s31
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
   %s31 = call i32 asm sideeffect "; def $0", "={s31}"()
   call void @external_void_func_void()
   call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
-; GCN: v_mov_b32_e32 v40, v31
-; GCN: s_swappc_b64
-; GCN-NEXT: v_mov_b32_e32 v31, v40
 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_mayclobber_v31:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def v31
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_mov_b32_e32 v40, v31
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: v_mov_b32_e32 v31, v40
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use v31
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
   %v31 = call i32 asm sideeffect "; def $0", "={v31}"()
   call void @external_void_func_void()
   call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
@@ -140,175 +317,294 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace
 }
 
 ; FIXME: What is the expected behavior for reserved registers here?
-
-; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; FLATSCR: s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; MUBUF: s_getpc_b64 s[4:5]
-; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-
-; GCN: #ASMSTART
-; GCN-NEXT: ; def s33
-; GCN-NEXT: #ASMEND
-
-; GCN-NOT: s33
-
-; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
-; MUBUF: s_swappc_b64 s[30:31], s[4:5]
-
-; GCN-NOT: s33
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; use s33
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_preserves_s33:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s33
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s33
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
   %s33 = call i32 asm sideeffect "; def $0", "={s33}"()
   call void @external_void_func_void()
   call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
-; GCN-NOT: s34
-
-; FLATSCR: s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; MUBUF: s_getpc_b64 s[4:5]
-; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; GCN: s_mov_b32 s32, 0
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; def s34
-; GCN-NEXT: ;;#ASMEND
-
-; GCN-NOT: s34
-
-; MUBUF: s_swappc_b64 s[30:31], s[4:5]
-; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
-
-; GCN-NOT: s34
-
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; use s34
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_preserves_s34(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_preserves_s34:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s34
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s34
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
   %s34 = call i32 asm sideeffect "; def $0", "={s34}"()
   call void @external_void_func_void()
   call void asm sideeffect "; use $0", "{s34}"(i32 %s34)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
-
-; MUBUF: s_getpc_b64 s[4:5]
-; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; FLATSCR: s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; GCN: s_mov_b32 s32, 0
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; def v40
-; GCN-NEXT: ;;#ASMEND
-
-; GCN-NOT: v40
-
-; MUBUF: s_swappc_b64 s[30:31], s[4:5]
-; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
-
-; GCN-NOT: v40
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; use v40
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_preserves_v40(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_preserves_v40:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
   %v40 = call i32 asm sideeffect "; def $0", "={v40}"()
   call void @external_void_func_void()
   call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_void_clobber_s33:
-; GCN: v_writelane_b32 v0, s33, 0
-; GCN-NEXT: #ASMSTART
-; GCN-NEXT: ; clobber
-; GCN-NEXT: #ASMEND
-; GCN-NEXT: v_readlane_b32 s33, v0, 0
-; GCN: s_setpc_b64
 define hidden void @void_func_void_clobber_s33() #2 {
+; MUBUF-LABEL: void_func_void_clobber_s33:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: v_writelane_b32 v0, s33, 0
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; clobber
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: v_readlane_b32 s33, v0, 0
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: void_func_void_clobber_s33:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: v_writelane_b32 v0, s33, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; clobber
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s33, v0, 0
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s33}"() #0
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_void_clobber_s34:
-; GCN: v_writelane_b32 v0, s34, 0
-; GCN-NEXT: #ASMSTART
-; GCN-NEXT: ; clobber
-; GCN-NEXT: #ASMEND
-; GCN-NEXT: v_readlane_b32 s34, v0, 0
-; GCN: s_setpc_b64
 define hidden void @void_func_void_clobber_s34() #2 {
+; MUBUF-LABEL: void_func_void_clobber_s34:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: v_writelane_b32 v0, s34, 0
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; clobber
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: v_readlane_b32 s34, v0, 0
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: void_func_void_clobber_s34:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: v_writelane_b32 v0, s34, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; clobber
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s34, v0, 0
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s34}"() #0
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
-; GCN: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN: s_mov_b32 s32, 0
-; GCN: s_swappc_b64
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
+; FLATSCR-LABEL: test_call_void_func_void_clobber_s33:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FLATSCR-NEXT: s_mov_b32 s14, s12
+; FLATSCR-NEXT: s_mov_b32 s13, s11
+; FLATSCR-NEXT: s_mov_b32 s12, s10
+; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
+; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5]
+; FLATSCR-NEXT: s_getpc_b64 s[16:17]
+; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s33@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s33@rel32@hi+12
+; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
+; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
+; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; FLATSCR-NEXT: s_endpgm
   call void @void_func_void_clobber_s33()
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34:
-; GCN: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN: s_mov_b32 s32, 0
-; GCN: s_swappc_b64
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {
+; FLATSCR-LABEL: test_call_void_func_void_clobber_s34:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FLATSCR-NEXT: s_mov_b32 s14, s12
+; FLATSCR-NEXT: s_mov_b32 s13, s11
+; FLATSCR-NEXT: s_mov_b32 s12, s10
+; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
+; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5]
+; FLATSCR-NEXT: s_getpc_b64 s[16:17]
+; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s34@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s34@rel32@hi+12
+; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
+; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
+; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; FLATSCR-NEXT: s_endpgm
   call void @void_func_void_clobber_s34()
   ret void
 }
 
-; GCN-LABEL: {{^}}callee_saved_sgpr_func:
-; GCN-NOT: s40
-; GCN: v_writelane_b32 v40, s40
-; GCN: s_swappc_b64
-; GCN-NOT: s40
-; GCN: ; use s40
-; GCN-NOT: s40
-; GCN: v_readlane_b32 s40, v40
-; GCN-NOT: s40
 define void @callee_saved_sgpr_func() #2 {
+; MUBUF-LABEL: callee_saved_sgpr_func:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v40, s4, 3
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s40, 2
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; def s40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; use s40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: v_readlane_b32 s40, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; MUBUF-NEXT: v_readlane_b32 s4, v40, 3
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_mov_b32 s33, s4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_saved_sgpr_func:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_mov_b32 s0, s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: v_writelane_b32 v40, s0, 3
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: v_writelane_b32 v40, s40, 2
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s40, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: v_readlane_b32 s0, v40, 3
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s33, s0
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
   %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
   call void @external_void_func_void()
   call void asm sideeffect "; use $0", "s"(i32 %s40) #0
   ret void
 }
 
-; GCN-LABEL: {{^}}callee_saved_sgpr_kernel:
-; GCN-NOT: s40
-; GCN: ; def s40
-; GCN-NOT: s40
-; GCN: s_swappc_b64
-; GCN-NOT: s40
-; GCN: ; use s40
-; GCN-NOT: s40
 define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
+; FLATSCR-LABEL: callee_saved_sgpr_kernel:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
   %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
   call void @external_void_func_void()
   call void asm sideeffect "; use $0", "s"(i32 %s40) #0
@@ -316,16 +612,92 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
 }
 
 ; First call preserved VGPR is used so it can't be used for SGPR spills.
-; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func:
-; GCN-NOT: s40
-; GCN: v_writelane_b32 v41, s40
-; GCN: s_swappc_b64
-; GCN-NOT: s40
-; GCN: ; use s40
-; GCN-NOT: s40
-; GCN: v_readlane_b32 s40, v41
-; GCN-NOT: s40
 define void @callee_saved_sgpr_vgpr_func() #2 {
+; MUBUF-LABEL: callee_saved_sgpr_vgpr_func:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v41, s4, 3
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v41, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v41, s31, 1
+; MUBUF-NEXT: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: v_writelane_b32 v41, s40, 2
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; def s40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; def v40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; use s40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; use v40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT: v_readlane_b32 s40, v41, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v41, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v41, 0
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; MUBUF-NEXT: v_readlane_b32 s4, v41, 3
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_mov_b32 s33, s4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_saved_sgpr_vgpr_func:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_mov_b32 s0, s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v41, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v41, s31, 1
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: v_writelane_b32 v41, s40, 2
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: v_readlane_b32 s40, v41, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v41, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v41, 0
+; FLATSCR-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: v_readlane_b32 s0, v41, 3
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s33, s0
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
   %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
   %v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0
   call void @external_void_func_void()
@@ -334,15 +706,30 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel:
-; GCN-NOT: s40
-; GCN: ; def s40
-; GCN-NOT: s40
-; GCN: s_swappc_b64
-; GCN-NOT: s40
-; GCN: ; use s40
-; GCN-NOT: s40
 define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 {
+; FLATSCR-LABEL: callee_saved_sgpr_vgpr_kernel:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def v32
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_mov_b32_e32 v40, v32
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
   %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
   %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
   call void @external_void_func_void()
diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
index d2b960fe43f84..0d6bccad89d82 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
@@ -1,13 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
-; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs:
-; CHECK-DAG: v_writelane_b32 v0, s98, 63
-; CHECK-DAG: v_writelane_b32 v1, s99, 0
-; CHECK-NOT: dummy
-; CHECK-DAG: v_readlane_b32 s99, v1, 0
-; CHECK-DAG: v_readlane_b32 s98, v0, 63
-
 define void @spill_more_than_wavesize_csr_sgprs() {
+; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: v_writelane_b32 v0, s35, 0
+; CHECK-NEXT: v_writelane_b32 v0, s36, 1
+; CHECK-NEXT: v_writelane_b32 v0, s37, 2
+; CHECK-NEXT: v_writelane_b32 v0, s38, 3
+; CHECK-NEXT: v_writelane_b32 v0, s39, 4
+; CHECK-NEXT: v_writelane_b32 v0, s40, 5
+; CHECK-NEXT: v_writelane_b32 v0, s41, 6
+; CHECK-NEXT: v_writelane_b32 v0, s42, 7
+; CHECK-NEXT: v_writelane_b32 v0, s43, 8
+; CHECK-NEXT: v_writelane_b32 v0, s44, 9
+; CHECK-NEXT: v_writelane_b32 v0, s45, 10
+; CHECK-NEXT: v_writelane_b32 v0, s46, 11
+; CHECK-NEXT: v_writelane_b32 v0, s47, 12
+; CHECK-NEXT: v_writelane_b32 v0, s48, 13
+; CHECK-NEXT: v_writelane_b32 v0, s49, 14
+; CHECK-NEXT: v_writelane_b32 v0, s50, 15
+; CHECK-NEXT: v_writelane_b32 v0, s51, 16
+; CHECK-NEXT: v_writelane_b32 v0, s52, 17
+; CHECK-NEXT: v_writelane_b32 v0, s53, 18
+; CHECK-NEXT: v_writelane_b32 v0, s54, 19
+; CHECK-NEXT: v_writelane_b32 v0, s55, 20
+; CHECK-NEXT: v_writelane_b32 v0, s56, 21
+; CHECK-NEXT: v_writelane_b32 v0, s57, 22
+; CHECK-NEXT: v_writelane_b32 v0, s58, 23
+; CHECK-NEXT: v_writelane_b32 v0, s59, 24
+; CHECK-NEXT: v_writelane_b32 v0, s60, 25
+; CHECK-NEXT: v_writelane_b32 v0, s61, 26
+; CHECK-NEXT: v_writelane_b32 v0, s62, 27
+; CHECK-NEXT: v_writelane_b32 v0, s63, 28
+; CHECK-NEXT: v_writelane_b32 v0, s64, 29
+; CHECK-NEXT: v_writelane_b32 v0, s65, 30
+; CHECK-NEXT: v_writelane_b32 v0, s66, 31
+; CHECK-NEXT: v_writelane_b32 v0, s67, 32
+; CHECK-NEXT: v_writelane_b32 v0, s68, 33
+; CHECK-NEXT: v_writelane_b32 v0, s69, 34
+; CHECK-NEXT: v_writelane_b32 v0, s70, 35
+; CHECK-NEXT: v_writelane_b32 v0, s71, 36
+; CHECK-NEXT: v_writelane_b32 v0, s72, 37
+; CHECK-NEXT: v_writelane_b32 v0, s73, 38
+; CHECK-NEXT: v_writelane_b32 v0, s74, 39
+; CHECK-NEXT: v_writelane_b32 v0, s75, 40
+; CHECK-NEXT: v_writelane_b32 v0, s76, 41
+; CHECK-NEXT: v_writelane_b32 v0, s77, 42
+; CHECK-NEXT: v_writelane_b32 v0, s78, 43
+; CHECK-NEXT: v_writelane_b32 v0, s79, 44
+; CHECK-NEXT: v_writelane_b32 v0, s80, 45
+; CHECK-NEXT: v_writelane_b32 v0, s81, 46
+; CHECK-NEXT: v_writelane_b32 v0, s82, 47
+; CHECK-NEXT: v_writelane_b32 v0, s83, 48
+; CHECK-NEXT: v_writelane_b32 v0, s84, 49
+; CHECK-NEXT: v_writelane_b32 v0, s85, 50
+; CHECK-NEXT: v_writelane_b32 v0, s86, 51
+; CHECK-NEXT: v_writelane_b32 v0, s87, 52
+; CHECK-NEXT: v_writelane_b32 v0, s88, 53
+; CHECK-NEXT: v_writelane_b32 v0, s89, 54
+; CHECK-NEXT: v_writelane_b32 v0, s90, 55
+; CHECK-NEXT: v_writelane_b32 v0, s91, 56
+; CHECK-NEXT: v_writelane_b32 v0, s92, 57
+; CHECK-NEXT: v_writelane_b32 v0, s93, 58
+; CHECK-NEXT: v_writelane_b32 v0, s94, 59
+; CHECK-NEXT: v_writelane_b32 v0, s95, 60
+; CHECK-NEXT: v_writelane_b32 v1, s99, 0
+; CHECK-NEXT: v_writelane_b32 v0, s96, 61
+; CHECK-NEXT: v_writelane_b32 v1, s100, 1
+; CHECK-NEXT: v_writelane_b32 v0, s97, 62
+; CHECK-NEXT: v_writelane_b32 v1, s101, 2
+; CHECK-NEXT: v_writelane_b32 v0, s98, 63
+; CHECK-NEXT: v_writelane_b32 v1, s102, 3
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s102, v1, 3
+; CHECK-NEXT: v_readlane_b32 s101, v1, 2
+; CHECK-NEXT: v_readlane_b32 s100, v1, 1
+; CHECK-NEXT: v_readlane_b32 s99, v1, 0
+; CHECK-NEXT: v_readlane_b32 s98, v0, 63
+; CHECK-NEXT: v_readlane_b32 s97, v0, 62
+; CHECK-NEXT: v_readlane_b32 s96, v0, 61
+; CHECK-NEXT: v_readlane_b32 s95, v0, 60
+; CHECK-NEXT: v_readlane_b32 s94, v0, 59
+; CHECK-NEXT: v_readlane_b32 s93, v0, 58
+; CHECK-NEXT: v_readlane_b32 s92, v0, 57
+; CHECK-NEXT: v_readlane_b32 s91, v0, 56
+; CHECK-NEXT: v_readlane_b32 s90, v0, 55
+; CHECK-NEXT: v_readlane_b32 s89, v0, 54
+; CHECK-NEXT: v_readlane_b32 s88, v0, 53
+; CHECK-NEXT: v_readlane_b32 s87, v0, 52
+; CHECK-NEXT: v_readlane_b32 s86, v0, 51
+; CHECK-NEXT: v_readlane_b32 s85, v0, 50
+; CHECK-NEXT: v_readlane_b32 s84, v0, 49
+; CHECK-NEXT: v_readlane_b32 s83, v0, 48
+; CHECK-NEXT: v_readlane_b32 s82, v0, 47
+; CHECK-NEXT: v_readlane_b32 s81, v0, 46
+; CHECK-NEXT: v_readlane_b32 s80, v0, 45
+; CHECK-NEXT: v_readlane_b32 s79, v0, 44
+; CHECK-NEXT: v_readlane_b32 s78, v0, 43
+; CHECK-NEXT: v_readlane_b32 s77, v0, 42
+; CHECK-NEXT: v_readlane_b32 s76, v0, 41
+; CHECK-NEXT: v_readlane_b32 s75, v0, 40
+; CHECK-NEXT: v_readlane_b32 s74, v0, 39
+; CHECK-NEXT: v_readlane_b32 s73, v0, 38
+; CHECK-NEXT: v_readlane_b32 s72, v0, 37
+; CHECK-NEXT: v_readlane_b32 s71, v0, 36
+; CHECK-NEXT: v_readlane_b32 s70, v0, 35
+; CHECK-NEXT: v_readlane_b32 s69, v0, 34
+; CHECK-NEXT: v_readlane_b32 s68, v0, 33
+; CHECK-NEXT: v_readlane_b32 s67, v0, 32
+; CHECK-NEXT: v_readlane_b32 s66, v0, 31
+; CHECK-NEXT: v_readlane_b32 s65, v0, 30
+; CHECK-NEXT: v_readlane_b32 s64, v0, 29
+; CHECK-NEXT: v_readlane_b32 s63, v0, 28
+; CHECK-NEXT: v_readlane_b32 s62, v0, 27
+; CHECK-NEXT: v_readlane_b32 s61, v0, 26
+; CHECK-NEXT: v_readlane_b32 s60, v0, 25
+; CHECK-NEXT: v_readlane_b32 s59, v0, 24
+; CHECK-NEXT: v_readlane_b32 s58, v0, 23
+; CHECK-NEXT: v_readlane_b32 s57, v0, 22
+; CHECK-NEXT: v_readlane_b32 s56, v0, 21
+; CHECK-NEXT: v_readlane_b32 s55, v0, 20
+; CHECK-NEXT: v_readlane_b32 s54, v0, 19
+; CHECK-NEXT: v_readlane_b32 s53, v0, 18
+; CHECK-NEXT: v_readlane_b32 s52, v0, 17
+; CHECK-NEXT: v_readlane_b32 s51, v0, 16
+; CHECK-NEXT: v_readlane_b32 s50, v0, 15
+; CHECK-NEXT: v_readlane_b32 s49, v0, 14
+; CHECK-NEXT: v_readlane_b32 s48, v0, 13
+; CHECK-NEXT: v_readlane_b32 s47, v0, 12
+; CHECK-NEXT: v_readlane_b32 s46, v0, 11
+; CHECK-NEXT: v_readlane_b32 s45, v0, 10
+; CHECK-NEXT: v_readlane_b32 s44, v0, 9
+; CHECK-NEXT: v_readlane_b32 s43, v0, 8
+; CHECK-NEXT: v_readlane_b32 s42, v0, 7
+; CHECK-NEXT: v_readlane_b32 s41, v0, 6
+; CHECK-NEXT: v_readlane_b32 s40, v0, 5
+; CHECK-NEXT: v_readlane_b32 s39, v0, 4
+; CHECK-NEXT: v_readlane_b32 s38, v0, 3
+; CHECK-NEXT: v_readlane_b32 s37, v0, 2
+; CHECK-NEXT: v_readlane_b32 s36, v0, 1
+; CHECK-NEXT: v_readlane_b32 s35, v0, 0
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
   call void asm sideeffect "",
  "~{s35},~{s36},~{s37},~{s38},~{s39},~{s40},~{s41},~{s42}
  ,~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49},~{s50}
@@ -21,13 +166,161 @@ define void @spill_more_than_wavesize_csr_sgprs() {
   ret void
 }
 
-; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object:
-; CHECK-DAG: v_writelane_b32 v1, s98, 63
-; CHECK-DAG: v_writelane_b32 v2, s99, 0
-; CHECK-NOT: dummy
-; CHECK-DAG: v_readlane_b32 s99, v2, 0
-; CHECK-DAG: v_readlane_b32 s98, v1, 63
 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
+; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs_with_stack_object:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: v_writelane_b32 v1, s35, 0
+; CHECK-NEXT: v_writelane_b32 v1, s36, 1
+; CHECK-NEXT: v_writelane_b32 v1, s37, 2
+; CHECK-NEXT: v_writelane_b32 v1, s38, 3
+; CHECK-NEXT: v_writelane_b32 v1, s39, 4
+; CHECK-NEXT: v_writelane_b32 v1, s40, 5
+; CHECK-NEXT: v_writelane_b32 v1, s41, 6
+; CHECK-NEXT: v_writelane_b32 v1, s42, 7
+; CHECK-NEXT: v_writelane_b32 v1, s43, 8
+; CHECK-NEXT: v_writelane_b32 v1, s44, 9
+; CHECK-NEXT: v_writelane_b32 v1, s45, 10
+; CHECK-NEXT: v_writelane_b32 v1, s46, 11
+; CHECK-NEXT: v_writelane_b32 v1, s47, 12
+; CHECK-NEXT: v_writelane_b32 v1, s48, 13
+; CHECK-NEXT: v_writelane_b32 v1, s49, 14
+; CHECK-NEXT: v_writelane_b32 v1, s50, 15
+; CHECK-NEXT: v_writelane_b32 v1, s51, 16
+; CHECK-NEXT: v_writelane_b32 v1, s52, 17
+; CHECK-NEXT: v_writelane_b32 v1, s53, 18
+; CHECK-NEXT: v_writelane_b32 v1, s54, 19
+; CHECK-NEXT: v_writelane_b32 v1, s55, 20
+; CHECK-NEXT: v_writelane_b32 v1, s56, 21
+; CHECK-NEXT: v_writelane_b32 v1, s57, 22
+; CHECK-NEXT: v_writelane_b32 v1, s58, 23
+; CHECK-NEXT: v_writelane_b32 v1, s59, 24
+; CHECK-NEXT: v_writelane_b32 v1, s60, 25
+; CHECK-NEXT: v_writelane_b32 v1, s61, 26
+; CHECK-NEXT: v_writelane_b32 v1, s62, 27
+; CHECK-NEXT: v_writelane_b32 v1, s63, 28
+; CHECK-NEXT: v_writelane_b32 v1, s64, 29
+; CHECK-NEXT: v_writelane_b32 v1, s65, 30
+; CHECK-NEXT: v_writelane_b32 v1, s66, 31
+; CHECK-NEXT: v_writelane_b32 v1, s67, 32
+; CHECK-NEXT: v_writelane_b32 v1, s68, 33
+; CHECK-NEXT: v_writelane_b32 v1, s69, 34
+; CHECK-NEXT: v_writelane_b32 v1, s70, 35
+; CHECK-NEXT: v_writelane_b32 v1, s71, 36
+; CHECK-NEXT: v_writelane_b32 v1, s72, 37
+; CHECK-NEXT: v_writelane_b32 v1, s73, 38
+; CHECK-NEXT: v_writelane_b32 v1, s74, 39
+; CHECK-NEXT: v_writelane_b32 v1, s75, 40
+; CHECK-NEXT: v_writelane_b32 v1, s76, 41
+; CHECK-NEXT: v_writelane_b32 v1, s77, 42
+; CHECK-NEXT: v_writelane_b32 v1, s78, 43
+; CHECK-NEXT: v_writelane_b32 v1, s79, 44
+; CHECK-NEXT: v_writelane_b32 v1, s80, 45
+; CHECK-NEXT: v_writelane_b32 v1, s81, 46
+; CHECK-NEXT: v_writelane_b32 v1, s82, 47
+; CHECK-NEXT: v_writelane_b32 v1, s83, 48
+; CHECK-NEXT: v_writelane_b32 v1, s84, 49
+; CHECK-NEXT: v_writelane_b32 v1, s85, 50
+; CHECK-NEXT: v_writelane_b32 v1, s86, 51
+; CHECK-NEXT: v_writelane_b32 v1, s87, 52
+; CHECK-NEXT: v_writelane_b32 v1, s88, 53
+; CHECK-NEXT: v_writelane_b32 v1, s89, 54
+; CHECK-NEXT: v_writelane_b32 v1, s90, 55
+; CHECK-NEXT: v_writelane_b32 v1, s91, 56
+; CHECK-NEXT: v_writelane_b32 v1, s92, 57
+; CHECK-NEXT: v_writelane_b32 v1, s93, 58
+; CHECK-NEXT: v_writelane_b32 v1, s94, 59
+; CHECK-NEXT: v_writelane_b32 v1, s95, 60
+; CHECK-NEXT: v_writelane_b32 v2, s99, 0
+; CHECK-NEXT: v_writelane_b32 v1, s96, 61
+; CHECK-NEXT: v_writelane_b32 v2, s100, 1
+; CHECK-NEXT: v_writelane_b32 v1, s97, 62
+; CHECK-NEXT: v_writelane_b32 v2, s101, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_writelane_b32 v1, s98, 63
+; CHECK-NEXT: v_writelane_b32 v2, s102, 3
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s102, v2, 3
+; CHECK-NEXT: v_readlane_b32 s101, v2, 2
+; CHECK-NEXT: v_readlane_b32 s100, v2, 1
+; CHECK-NEXT: v_readlane_b32 s99, v2, 0
+; CHECK-NEXT: v_readlane_b32 s98, v1, 63
+; CHECK-NEXT: v_readlane_b32 s97, v1, 62
+; CHECK-NEXT: v_readlane_b32 s96, v1, 61
+; CHECK-NEXT: v_readlane_b32 s95, v1, 60
+; CHECK-NEXT: v_readlane_b32 s94, v1, 59
+; CHECK-NEXT: v_readlane_b32 s93, v1, 58
+; CHECK-NEXT: v_readlane_b32 s92, v1, 57
+; CHECK-NEXT: v_readlane_b32 s91, v1, 56
+; CHECK-NEXT: v_readlane_b32 s90, v1, 55
+; CHECK-NEXT: v_readlane_b32 s89, v1, 54
+; CHECK-NEXT: v_readlane_b32 s88, v1, 53
+; CHECK-NEXT: v_readlane_b32 s87, v1, 52
+; CHECK-NEXT: v_readlane_b32 s86, v1, 51
+; CHECK-NEXT: v_readlane_b32 s85, v1, 50
+; CHECK-NEXT: v_readlane_b32 s84, v1, 49
+; CHECK-NEXT: v_readlane_b32 s83, v1, 48
+; CHECK-NEXT: v_readlane_b32 s82, v1, 47
+; CHECK-NEXT: v_readlane_b32 s81, v1, 46
+; CHECK-NEXT: v_readlane_b32 s80, v1, 45
+; CHECK-NEXT: v_readlane_b32 s79, v1, 44
+; CHECK-NEXT: v_readlane_b32 s78, v1, 43
+; CHECK-NEXT: v_readlane_b32 s77, v1, 42
+; CHECK-NEXT: v_readlane_b32 s76, v1, 41
+; CHECK-NEXT: v_readlane_b32 s75, v1, 40
+; CHECK-NEXT: v_readlane_b32 s74, v1, 39
+; CHECK-NEXT: v_readlane_b32 s73, v1, 38
+; CHECK-NEXT: v_readlane_b32 s72, v1, 37
+; CHECK-NEXT: v_readlane_b32 s71, v1, 36
+; CHECK-NEXT: v_readlane_b32 s70, v1, 35
+; CHECK-NEXT: v_readlane_b32 s69, v1, 34
+; CHECK-NEXT: v_readlane_b32 s68, v1, 33
+; CHECK-NEXT: v_readlane_b32 s67, v1, 32
+; CHECK-NEXT: v_readlane_b32 s66, v1, 31
+; CHECK-NEXT: v_readlane_b32 s65, v1, 30
+; CHECK-NEXT: v_readlane_b32 s64, v1, 29
+; CHECK-NEXT: v_readlane_b32 s63, v1, 28
+; CHECK-NEXT: v_readlane_b32 s62, v1, 27
+; CHECK-NEXT: v_readlane_b32 s61, v1, 26
+; CHECK-NEXT: v_readlane_b32 s60, v1, 25
+; CHECK-NEXT: v_readlane_b32 s59, v1, 24
+; CHECK-NEXT: v_readlane_b32 s58, v1, 23
+; CHECK-NEXT: v_readlane_b32 s57, v1, 22
+; CHECK-NEXT: v_readlane_b32 s56, v1, 21
+; CHECK-NEXT: v_readlane_b32 s55, v1, 20
+; CHECK-NEXT: v_readlane_b32 s54, v1, 19
+; CHECK-NEXT: v_readlane_b32 s53, v1, 18
+; CHECK-NEXT: v_readlane_b32 s52, v1, 17
+; CHECK-NEXT: v_readlane_b32 s51, v1, 16
+; CHECK-NEXT: v_readlane_b32 s50, v1, 15
+; CHECK-NEXT: v_readlane_b32 s49, v1, 14
+; CHECK-NEXT: v_readlane_b32 s48, v1, 13
+; CHECK-NEXT: v_readlane_b32 s47, v1, 12
+; CHECK-NEXT: v_readlane_b32 s46, v1, 11
+; CHECK-NEXT: v_readlane_b32 s45, v1, 10
+; CHECK-NEXT: v_readlane_b32 s44, v1, 9
+; CHECK-NEXT: v_readlane_b32 s43, v1, 8
+; CHECK-NEXT: v_readlane_b32 s42, v1, 7
+; CHECK-NEXT: v_readlane_b32 s41, v1, 6
+; CHECK-NEXT: v_readlane_b32 s40, v1, 5
+; CHECK-NEXT: v_readlane_b32 s39, v1, 4
+; CHECK-NEXT: v_readlane_b32 s38, v1, 3
+; CHECK-NEXT: v_readlane_b32 s37, v1, 2
+; CHECK-NEXT: v_readlane_b32 s36, v1, 1
+; CHECK-NEXT: v_readlane_b32 s35, v1, 0
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   call void asm sideeffect "",
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index fed60eecc8a8b..0e568e3071e99 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -8,92 +8,168 @@
 ; 4 byte emergency stack slot
 ; = 144 bytes with padding between them
 
-; GCN-LABEL: {{^}}needs_align16_default_stack_align:
-; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0
-; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, s32
-; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[SCALED_IDX]], [[FRAMEDIFF]]
-
-; GCN-NOT: s32
-
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-
-; GCN-NOT: s32
-
-; GCN: ; ScratchSize: 144
 define void @needs_align16_default_stack_align(i32 %idx) #0 {
+; GCN-LABEL: needs_align16_default_stack_align:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s32
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 4
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v2, 12, v0
+; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, 8, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 3
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 144
   %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
   %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 16
   ret void
 }
 
-; GCN-LABEL: {{^}}needs_align16_stack_align4:
-; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
-; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffffc00
-
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: s_addk_i32 s32, 0x2800{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-
-; GCN: s_mov_b32 s32, s34
-
-; GCN: ; ScratchSize: 160
 define void @needs_align16_stack_align4(i32 %idx) #2 {
+; GCN-LABEL: needs_align16_stack_align4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x3c0
+; GCN-NEXT: s_and_b32 s33, s33, 0xfffffc00
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 4
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v2, 12, v0
+; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, 8, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 3
+; GCN-NEXT: s_mov_b32 s5, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_addk_i32 s32, 0x2800
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s5
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 160
   %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
   %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 16
   ret void
 }
 
-; GCN-LABEL: {{^}}needs_align32:
-; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
-; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffff800
-
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: s_addk_i32 s32, 0x3000{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: s_mov_b32 s32, s34
-
-; GCN: ; ScratchSize: 192
 define void @needs_align32(i32 %idx) #0 {
+; GCN-LABEL: needs_align32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x7c0
+; GCN-NEXT: s_and_b32 s33, s33, 0xfffff800
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 4
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v2, 12, v0
+; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, 8, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 3
+; GCN-NEXT: s_mov_b32 s5, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_addk_i32 s32, 0x3000
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s5
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 192
   %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
   %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 32
   ret void
 }
 
-; GCN-LABEL: {{^}}force_realign4:
-; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
-; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffffff00
-; GCN: s_addk_i32 s32, 0xd00{{$}}
-
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: s_mov_b32 s32, s34
-
-; GCN: ; ScratchSize: 52
 define void @force_realign4(i32 %idx) #1 {
+; GCN-LABEL: force_realign4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0xc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffffff00
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33
+; GCN-NEXT: s_mov_b32 s5, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_addk_i32 s32, 0xd00
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v1, 3
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s5
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 52
   %alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [8 x i32], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
   store volatile i32 3, ptr addrspace(5) %gep0, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}kernel_call_align16_from_8:
-; GCN: s_movk_i32 s32, 0x400{{$}}
-; GCN-NOT: s32
-; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
+; GCN-LABEL: kernel_call_align16_from_8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_getpc_b64 s[14:15]
+; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v3, 2
+; GCN-NEXT: v_or_b32_e32 v31, v0, v2
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_movk_i32 s32, 0x400
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: s_endpgm
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 2, ptr addrspace(5) %alloca
   call void @needs_align16_default_stack_align(i32 1)
@@ -101,10 +177,32 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
 }
 
 ; The call sequence should keep the stack on call aligned to 4
-; GCN-LABEL: {{^}}kernel_call_align16_from_5:
-; GCN: s_movk_i32 s32, 0x400
-; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align16_from_5() {
+; GCN-LABEL: kernel_call_align16_from_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_getpc_b64 s[14:15]
+; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v3, 2
+; GCN-NEXT: v_or_b32_e32 v31, v0, v2
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_movk_i32 s32, 0x400
+; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: s_endpgm
   %alloca0 = alloca i8, align 1, addrspace(5)
   store volatile i8 2, ptr addrspace(5) %alloca0
 
@@ -112,10 +210,32 @@ define amdgpu_kernel void @kernel_call_align16_from_5() {
   ret void
 }
 
-; GCN-LABEL: {{^}}kernel_call_align4_from_5:
-; GCN: s_movk_i32 s32, 0x400
-; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_align4_from_5() {
+; GCN-LABEL: kernel_call_align4_from_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_getpc_b64 s[14:15]
+; GCN-NEXT: s_add_u32 s14, s14, needs_align16_stack_align4@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_stack_align4@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v3, 2
+; GCN-NEXT: v_or_b32_e32 v31, v0, v2
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_movk_i32 s32, 0x400
+; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: s_endpgm
   %alloca0 = alloca i8, align 1, addrspace(5)
   store volatile i8 2, ptr addrspace(5) %alloca0
 
@@ -123,28 +243,36 @@ define amdgpu_kernel void @kernel_call_align4_from_5() {
   ret void
 }
 
-; GCN-LABEL: {{^}}default_realign_align128:
-; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000
-; GCN-NEXT: s_mov_b32 s5, s34
-; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: s_addk_i32 s32, 0x4000
-; GCN-NOT: s33
-; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}}
-; GCN: s_mov_b32 s32, s34
-; GCN: s_mov_b32 s33, [[FP_COPY]]
 define void @default_realign_align128(i32 %idx) #0 {
+; GCN-LABEL: default_realign_align128:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GCN-NEXT: s_mov_b32 s5, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_addk_i32 s32, 0x4000
+; GCN-NEXT: v_mov_b32_e32 v0, 9
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s5
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %alloca.align = alloca i32, align 128, addrspace(5)
   store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
   ret void
 }
 
-; GCN-LABEL: {{^}}disable_realign_align128:
-; GCN-NOT: s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
-; GCN-NOT: s32
 define void @disable_realign_align128(i32 %idx) #3 {
+; GCN-LABEL: disable_realign_align128:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 9
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %alloca.align = alloca i32, align 128, addrspace(5)
   store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
   ret void
@@ -156,35 +284,48 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; since there is a local object with an alignment of 1024.
 ; Should use BP to access the incoming stack arguments.
 ; The BP value is saved/restored with a VGPR spill.
-
; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
-; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
-; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[18:19]
-; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
-; GCN: s_mov_b32 s34, s32
-; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
-; GCN-DAG: s_add_i32 s32, s32, 0x30000
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
-; GCN: s_swappc_b64 s[30:31],
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s16, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0xffc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000
+; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[18:19]
+; GCN-NEXT: v_writelane_b32 v40, s16, 2
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_writelane_b32 v40, s34, 3
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4
+; GCN-NEXT: s_add_i32 s32, s32, 0x30000
+; GCN-NEXT: s_getpc_b64 s[16:17]
+; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
+; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: v_readlane_b32 s4, v40, 2
+; GCN-NEXT: v_readlane_b32 s34, v40, 3
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]

-; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1
-; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0
-; GCN-NEXT: s_mov_b32 s32, s34
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2
-; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN: s_setpc_b64 s[30:31]
%temp = alloca i32, align 1024, addrspace(5)
store volatile i32 0, ptr addrspace(5) %temp, align 1024
call void @extern_func(<32 x i32> %a, i32 %b)
@@ -198,23 +339,56 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu
; index variable, the base pointer first gets loaded into a VGPR
; and that value should be further referenced to load the incoming values.
; The BP value will get saved/restored in an SGPR at the prologue/epilogue.
-
; GCN-LABEL: needs_align1024_stack_args_used_inside_loop:
-; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_add_i32 s33, s32, 0xffc0
-; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34
-; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000
-; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34
-; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; GCN: s_add_i32 s32, s32, 0x30000
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024
-; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen
-; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]]
-; GCN: s_mov_b32 s32, s34
-; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]]
-; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; %bb.0: ; %begin
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s11, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0xffc0
+; GCN-NEXT: s_mov_b32 s14, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000
+; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s34
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_add_i32 s32, s32, 0x30000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1024
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GCN-NEXT: s_branch .LBB10_2
+; GCN-NEXT: .LBB10_1: ; %Flow
+; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7]
+; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB10_4
+; GCN-NEXT: .LBB10_2: ; %loop_body
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s10, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT: s_cbranch_execz .LBB10_1
+; GCN-NEXT: ; %bb.3: ; %loop_end
+; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1
+; GCN-NEXT: s_add_i32 s10, s10, 1
+; GCN-NEXT: s_cmp_eq_u32 s10, 9
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_and_b64 s[12:13], s[12:13], exec
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 4, v1
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
+; GCN-NEXT: s_branch .LBB10_1
+; GCN-NEXT: .LBB10_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s14
+; GCN-NEXT: s_mov_b32 s33, s11
+; GCN-NEXT: s_setpc_b64 s[30:31]
begin:
%local_var = alloca i32, align 1024, addrspace(5)
store volatile i32 0, ptr addrspace(5) %local_var, align 1024
@@ -239,16 +413,31 @@ exit: ; preds = %loop_end, %loop_b
define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy:
-; GCN: ; %bb.0:
-; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0
-; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
-; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN: s_setpc_b64 s[30:31]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 vcc_lo, s33
+; GCN-NEXT: s_add_i32 s33, s32, 
0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v1, s34, 0 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s34, v1, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, vcc_lo +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 ; Use all clobberable registers, so BP has to spill to a VGPR. @@ -262,15 +451,172 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 { define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { ; If there are no free SGPRs or VGPRs available we must spill the BP to memory. - -; GCN-LABEL: no_free_regs_spill_bp_to_mem -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_xor_saveexec_b64 s[6:7], -1 -; GCN: buffer_store_dword v39, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN: buffer_store_dword v0, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, s34 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33 +; GCN-LABEL: no_free_regs_spill_bp_to_memory: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: v_writelane_b32 v39, s39, 0 +; GCN-NEXT: v_writelane_b32 v39, s40, 1 +; GCN-NEXT: v_writelane_b32 v39, s41, 2 +; GCN-NEXT: v_writelane_b32 v39, s42, 3 +; GCN-NEXT: v_writelane_b32 v39, s43, 4 +; GCN-NEXT: v_writelane_b32 v39, s44, 5 +; GCN-NEXT: v_writelane_b32 v39, s45, 6 +; GCN-NEXT: v_writelane_b32 v39, s46, 7 +; GCN-NEXT: v_writelane_b32 v39, s47, 8 +; GCN-NEXT: v_writelane_b32 v39, s48, 9 +; GCN-NEXT: v_writelane_b32 v39, s49, 10 +; GCN-NEXT: v_writelane_b32 v39, s50, 11 +; GCN-NEXT: v_writelane_b32 v39, s51, 12 +; GCN-NEXT: v_writelane_b32 v39, s52, 13 +; GCN-NEXT: v_writelane_b32 v39, s53, 14 +; GCN-NEXT: v_writelane_b32 v39, s54, 15 +; GCN-NEXT: v_writelane_b32 v39, s55, 16 +; GCN-NEXT: v_writelane_b32 v39, s56, 17 +; GCN-NEXT: v_writelane_b32 v39, s57, 18 +; GCN-NEXT: v_writelane_b32 v39, s58, 19 +; GCN-NEXT: v_writelane_b32 v39, s59, 20 +; GCN-NEXT: v_writelane_b32 v39, s60, 21 +; GCN-NEXT: v_writelane_b32 v39, s61, 22 +; GCN-NEXT: v_writelane_b32 v39, s62, 23 +; GCN-NEXT: v_writelane_b32 v39, s63, 24 +; GCN-NEXT: v_writelane_b32 v39, s64, 25 +; GCN-NEXT: v_writelane_b32 v39, s65, 26 +; GCN-NEXT: 
v_writelane_b32 v39, s66, 27 +; GCN-NEXT: v_writelane_b32 v39, s67, 28 +; GCN-NEXT: v_writelane_b32 v39, s68, 29 +; GCN-NEXT: v_writelane_b32 v39, s69, 30 +; GCN-NEXT: v_writelane_b32 v39, s70, 31 +; GCN-NEXT: v_writelane_b32 v39, s71, 32 +; GCN-NEXT: v_writelane_b32 v39, s72, 33 +; GCN-NEXT: v_writelane_b32 v39, s73, 34 +; GCN-NEXT: v_writelane_b32 v39, s74, 35 +; GCN-NEXT: v_writelane_b32 v39, s75, 36 +; GCN-NEXT: v_writelane_b32 v39, s76, 37 +; GCN-NEXT: v_writelane_b32 v39, s77, 38 +; GCN-NEXT: v_writelane_b32 v39, s78, 39 +; GCN-NEXT: v_writelane_b32 v39, s79, 40 +; GCN-NEXT: v_writelane_b32 v39, s80, 41 +; GCN-NEXT: v_writelane_b32 v39, s81, 42 +; GCN-NEXT: v_writelane_b32 v39, s82, 43 +; GCN-NEXT: v_writelane_b32 v39, s83, 44 +; GCN-NEXT: v_writelane_b32 v39, s84, 45 +; GCN-NEXT: v_writelane_b32 v39, s85, 46 +; GCN-NEXT: v_writelane_b32 v39, s86, 47 +; GCN-NEXT: v_writelane_b32 v39, s87, 48 +; GCN-NEXT: v_writelane_b32 v39, s88, 49 +; GCN-NEXT: v_writelane_b32 v39, s89, 50 +; GCN-NEXT: v_writelane_b32 v39, s90, 51 +; GCN-NEXT: v_writelane_b32 v39, s91, 52 +; GCN-NEXT: v_writelane_b32 v39, s92, 53 +; GCN-NEXT: v_writelane_b32 v39, s93, 54 +; GCN-NEXT: v_writelane_b32 v39, s94, 55 +; GCN-NEXT: v_writelane_b32 v39, s95, 56 +; GCN-NEXT: v_writelane_b32 v39, s96, 57 +; GCN-NEXT: v_writelane_b32 v39, s97, 58 +; GCN-NEXT: v_writelane_b32 v39, s98, 59 +; GCN-NEXT: v_writelane_b32 v39, s99, 60 +; GCN-NEXT: v_writelane_b32 v39, s100, 61 +; GCN-NEXT: v_writelane_b32 v39, s101, 62 +; GCN-NEXT: v_writelane_b32 v39, s102, 63 +; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber all VGPRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s102, v39, 63 +; GCN-NEXT: v_readlane_b32 s101, v39, 62 +; GCN-NEXT: v_readlane_b32 s100, v39, 61 +; GCN-NEXT: v_readlane_b32 s99, v39, 60 +; GCN-NEXT: v_readlane_b32 s98, v39, 59 +; GCN-NEXT: v_readlane_b32 s97, v39, 58 +; GCN-NEXT: v_readlane_b32 s96, v39, 57 +; GCN-NEXT: v_readlane_b32 s95, v39, 56 +; GCN-NEXT: v_readlane_b32 s94, v39, 55 +; GCN-NEXT: v_readlane_b32 s93, v39, 54 +; GCN-NEXT: v_readlane_b32 s92, v39, 53 +; GCN-NEXT: v_readlane_b32 s91, v39, 52 +; GCN-NEXT: v_readlane_b32 s90, v39, 51 +; GCN-NEXT: v_readlane_b32 s89, v39, 50 +; GCN-NEXT: v_readlane_b32 s88, v39, 49 +; GCN-NEXT: v_readlane_b32 s87, v39, 48 +; GCN-NEXT: v_readlane_b32 s86, v39, 47 +; GCN-NEXT: v_readlane_b32 s85, v39, 46 +; GCN-NEXT: v_readlane_b32 s84, v39, 45 +; GCN-NEXT: v_readlane_b32 s83, v39, 44 +; GCN-NEXT: v_readlane_b32 s82, v39, 43 +; GCN-NEXT: v_readlane_b32 s81, v39, 42 +; GCN-NEXT: v_readlane_b32 s80, v39, 41 +; GCN-NEXT: v_readlane_b32 s79, v39, 40 +; GCN-NEXT: v_readlane_b32 s78, v39, 39 +; GCN-NEXT: v_readlane_b32 s77, v39, 38 +; GCN-NEXT: v_readlane_b32 s76, v39, 37 +; GCN-NEXT: v_readlane_b32 s75, v39, 36 +; GCN-NEXT: v_readlane_b32 s74, v39, 35 +; GCN-NEXT: v_readlane_b32 s73, v39, 34 +; GCN-NEXT: v_readlane_b32 s72, v39, 33 +; GCN-NEXT: v_readlane_b32 s71, v39, 32 +; GCN-NEXT: v_readlane_b32 s70, v39, 31 +; GCN-NEXT: v_readlane_b32 s69, v39, 30 +; GCN-NEXT: v_readlane_b32 s68, v39, 29 +; GCN-NEXT: v_readlane_b32 s67, v39, 28 +; GCN-NEXT: v_readlane_b32 s66, v39, 27 +; GCN-NEXT: 
v_readlane_b32 s65, v39, 26 +; GCN-NEXT: v_readlane_b32 s64, v39, 25 +; GCN-NEXT: v_readlane_b32 s63, v39, 24 +; GCN-NEXT: v_readlane_b32 s62, v39, 23 +; GCN-NEXT: v_readlane_b32 s61, v39, 22 +; GCN-NEXT: v_readlane_b32 s60, v39, 21 +; GCN-NEXT: v_readlane_b32 s59, v39, 20 +; GCN-NEXT: v_readlane_b32 s58, v39, 19 +; GCN-NEXT: v_readlane_b32 s57, v39, 18 +; GCN-NEXT: v_readlane_b32 s56, v39, 17 +; GCN-NEXT: v_readlane_b32 s55, v39, 16 +; GCN-NEXT: v_readlane_b32 s54, v39, 15 +; GCN-NEXT: v_readlane_b32 s53, v39, 14 +; GCN-NEXT: v_readlane_b32 s52, v39, 13 +; GCN-NEXT: v_readlane_b32 s51, v39, 12 +; GCN-NEXT: v_readlane_b32 s50, v39, 11 +; GCN-NEXT: v_readlane_b32 s49, v39, 10 +; GCN-NEXT: v_readlane_b32 s48, v39, 9 +; GCN-NEXT: v_readlane_b32 s47, v39, 8 +; GCN-NEXT: v_readlane_b32 s46, v39, 7 +; GCN-NEXT: v_readlane_b32 s45, v39, 6 +; GCN-NEXT: v_readlane_b32 s44, v39, 5 +; GCN-NEXT: v_readlane_b32 s43, v39, 4 +; GCN-NEXT: v_readlane_b32 s42, v39, 3 +; GCN-NEXT: v_readlane_b32 s41, v39, 2 +; GCN-NEXT: v_readlane_b32 s40, v39, 1 +; GCN-NEXT: v_readlane_b32 s39, v39, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 @@ -297,22 +643,179 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i32 %b, ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #5 { ; If the size of the offset exceeds the MUBUF offset field we need another ; scratch VGPR to hold the offset. 
- -; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-NEXT: s_add_i32 s5, s33, 0x42100 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_add_i32 s5, s33, 0x42200 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NEXT: s_add_i32 s5, s33, 0x42300 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_add_i32 s5, s33, 0x42200 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: s_add_i32 s5, s33, 0x42300 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: v_writelane_b32 v39, s39, 0 +; GCN-NEXT: v_writelane_b32 v39, s40, 1 +; GCN-NEXT: v_writelane_b32 v39, s41, 2 +; GCN-NEXT: v_writelane_b32 v39, s42, 3 +; GCN-NEXT: v_writelane_b32 v39, s43, 4 +; GCN-NEXT: v_writelane_b32 v39, s44, 5 +; GCN-NEXT: v_writelane_b32 v39, s45, 6 +; GCN-NEXT: v_writelane_b32 v39, s46, 7 +; GCN-NEXT: v_writelane_b32 v39, s47, 8 +; GCN-NEXT: v_writelane_b32 v39, s48, 9 +; GCN-NEXT: v_writelane_b32 v39, s49, 10 +; GCN-NEXT: v_writelane_b32 v39, s50, 11 +; GCN-NEXT: v_writelane_b32 v39, s51, 12 +; GCN-NEXT: v_writelane_b32 v39, s52, 13 +; GCN-NEXT: v_writelane_b32 v39, s53, 14 +; GCN-NEXT: v_writelane_b32 v39, s54, 15 +; GCN-NEXT: v_writelane_b32 v39, s55, 16 +; GCN-NEXT: v_writelane_b32 v39, s56, 17 +; GCN-NEXT: v_writelane_b32 v39, s57, 18 +; GCN-NEXT: v_writelane_b32 v39, s58, 19 +; GCN-NEXT: v_writelane_b32 v39, s59, 20 +; GCN-NEXT: v_writelane_b32 v39, s60, 21 +; GCN-NEXT: v_writelane_b32 v39, s61, 22 +; GCN-NEXT: v_writelane_b32 v39, s62, 23 +; GCN-NEXT: v_writelane_b32 v39, s63, 24 +; GCN-NEXT: v_writelane_b32 v39, s64, 25 +; GCN-NEXT: v_writelane_b32 v39, s65, 26 +; GCN-NEXT: v_writelane_b32 v39, s66, 27 +; GCN-NEXT: v_writelane_b32 v39, s67, 28 +; GCN-NEXT: v_writelane_b32 v39, s68, 29 +; GCN-NEXT: v_writelane_b32 v39, s69, 30 +; GCN-NEXT: v_writelane_b32 v39, s70, 31 +; GCN-NEXT: v_writelane_b32 v39, s71, 32 +; GCN-NEXT: v_writelane_b32 v39, s72, 33 +; GCN-NEXT: v_writelane_b32 v39, s73, 34 +; GCN-NEXT: v_writelane_b32 v39, s74, 35 +; GCN-NEXT: v_writelane_b32 v39, s75, 36 +; GCN-NEXT: v_writelane_b32 v39, s76, 37 +; GCN-NEXT: v_writelane_b32 v39, s77, 38 +; GCN-NEXT: v_writelane_b32 v39, s78, 39 +; GCN-NEXT: v_writelane_b32 v39, s79, 40 +; GCN-NEXT: v_writelane_b32 v39, s80, 41 +; GCN-NEXT: v_writelane_b32 v39, s81, 42 +; GCN-NEXT: v_writelane_b32 v39, s82, 43 +; GCN-NEXT: v_writelane_b32 v39, s83, 44 +; GCN-NEXT: v_writelane_b32 v39, s84, 45 +; GCN-NEXT: v_writelane_b32 v39, s85, 46 +; GCN-NEXT: 
v_writelane_b32 v39, s86, 47 +; GCN-NEXT: v_writelane_b32 v39, s87, 48 +; GCN-NEXT: v_writelane_b32 v39, s88, 49 +; GCN-NEXT: v_writelane_b32 v39, s89, 50 +; GCN-NEXT: v_writelane_b32 v39, s90, 51 +; GCN-NEXT: v_writelane_b32 v39, s91, 52 +; GCN-NEXT: v_writelane_b32 v39, s92, 53 +; GCN-NEXT: v_writelane_b32 v39, s93, 54 +; GCN-NEXT: v_writelane_b32 v39, s94, 55 +; GCN-NEXT: v_writelane_b32 v39, s95, 56 +; GCN-NEXT: v_writelane_b32 v39, s96, 57 +; GCN-NEXT: v_writelane_b32 v39, s97, 58 +; GCN-NEXT: v_writelane_b32 v39, s98, 59 +; GCN-NEXT: v_writelane_b32 v39, s99, 60 +; GCN-NEXT: v_writelane_b32 v39, s100, 61 +; GCN-NEXT: v_writelane_b32 v39, s101, 62 +; GCN-NEXT: v_mov_b32_e32 v1, 0x1080 +; GCN-NEXT: v_writelane_b32 v39, s102, 63 +; GCN-NEXT: s_add_i32 s32, s32, 0x46000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber all VGPRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_add_i32 s5, s33, 0x42200 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_add_i32 s5, s33, 0x42300 +; GCN-NEXT: v_readlane_b32 s102, v39, 63 +; GCN-NEXT: v_readlane_b32 s101, v39, 62 +; GCN-NEXT: v_readlane_b32 s100, v39, 61 +; GCN-NEXT: v_readlane_b32 s99, v39, 60 +; GCN-NEXT: v_readlane_b32 s98, v39, 59 +; GCN-NEXT: v_readlane_b32 s97, v39, 58 +; GCN-NEXT: v_readlane_b32 s96, v39, 57 +; GCN-NEXT: v_readlane_b32 s95, v39, 56 +; GCN-NEXT: v_readlane_b32 s94, v39, 55 +; GCN-NEXT: v_readlane_b32 s93, v39, 54 +; GCN-NEXT: v_readlane_b32 s92, v39, 53 +; GCN-NEXT: v_readlane_b32 s91, v39, 52 +; GCN-NEXT: v_readlane_b32 s90, v39, 51 +; GCN-NEXT: v_readlane_b32 s89, v39, 50 +; GCN-NEXT: v_readlane_b32 s88, v39, 49 +; GCN-NEXT: v_readlane_b32 s87, v39, 48 +; GCN-NEXT: v_readlane_b32 s86, v39, 47 +; GCN-NEXT: v_readlane_b32 s85, v39, 46 +; GCN-NEXT: v_readlane_b32 s84, v39, 45 +; GCN-NEXT: v_readlane_b32 s83, v39, 44 +; GCN-NEXT: v_readlane_b32 s82, v39, 43 +; GCN-NEXT: v_readlane_b32 s81, v39, 42 +; GCN-NEXT: v_readlane_b32 s80, v39, 41 +; GCN-NEXT: v_readlane_b32 s79, v39, 40 +; GCN-NEXT: v_readlane_b32 s78, v39, 39 +; GCN-NEXT: v_readlane_b32 s77, v39, 38 +; GCN-NEXT: v_readlane_b32 s76, v39, 37 +; GCN-NEXT: v_readlane_b32 s75, v39, 36 +; GCN-NEXT: v_readlane_b32 s74, v39, 35 +; GCN-NEXT: v_readlane_b32 s73, v39, 34 +; GCN-NEXT: v_readlane_b32 s72, v39, 33 +; GCN-NEXT: v_readlane_b32 s71, v39, 32 +; GCN-NEXT: v_readlane_b32 s70, v39, 31 +; GCN-NEXT: v_readlane_b32 s69, v39, 30 +; GCN-NEXT: v_readlane_b32 s68, v39, 29 +; GCN-NEXT: v_readlane_b32 s67, v39, 28 +; GCN-NEXT: v_readlane_b32 s66, v39, 27 +; GCN-NEXT: v_readlane_b32 s65, v39, 26 +; GCN-NEXT: v_readlane_b32 s64, v39, 25 +; GCN-NEXT: v_readlane_b32 s63, v39, 24 +; GCN-NEXT: v_readlane_b32 s62, v39, 23 +; GCN-NEXT: v_readlane_b32 s61, v39, 22 +; GCN-NEXT: v_readlane_b32 s60, v39, 21 +; GCN-NEXT: v_readlane_b32 s59, v39, 20 +; GCN-NEXT: v_readlane_b32 s58, v39, 19 +; GCN-NEXT: v_readlane_b32 s57, v39, 18 +; GCN-NEXT: v_readlane_b32 s56, v39, 17 +; GCN-NEXT: v_readlane_b32 s55, v39, 16 +; GCN-NEXT: v_readlane_b32 s54, v39, 15 +; GCN-NEXT: v_readlane_b32 s53, v39, 14 +; GCN-NEXT: v_readlane_b32 s52, v39, 13 +; GCN-NEXT: v_readlane_b32 s51, v39, 12 +; GCN-NEXT: v_readlane_b32 s50, v39, 11 +; GCN-NEXT: v_readlane_b32 s49, v39, 10 +; GCN-NEXT: v_readlane_b32 s48, v39, 9 +; GCN-NEXT: 
v_readlane_b32 s47, v39, 8 +; GCN-NEXT: v_readlane_b32 s46, v39, 7 +; GCN-NEXT: v_readlane_b32 s45, v39, 6 +; GCN-NEXT: v_readlane_b32 s44, v39, 5 +; GCN-NEXT: v_readlane_b32 s43, v39, 4 +; GCN-NEXT: v_readlane_b32 s42, v39, 3 +; GCN-NEXT: v_readlane_b32 s41, v39, 2 +; GCN-NEXT: v_readlane_b32 s40, v39, 1 +; GCN-NEXT: v_readlane_b32 s39, v39, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128