/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
/* To compile this assembly code:
 *
 * Navi1x:
 *   cpp -DASIC_FAMILY=CHIP_NAVI10 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3
 *   sp3 nv1x.sp3 -hex nv1x.hex
 *
 * gfx10:
 *   cpp -DASIC_FAMILY=CHIP_SIENNA_CICHLID cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3
 *   sp3 gfx10.sp3 -hex gfx10.hex
 *
 * gfx11:
 *   cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3
 *   sp3 gfx11.sp3 -hex gfx11.hex
 *
 * gfx12:
 *   cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx10.asm -P -o gfx12.sp3
 *   sp3 gfx12.sp3 -hex gfx12.hex
 */
#define CHIP_NAVI10 26
#define CHIP_SIENNA_CICHLID 30
#define CHIP_PLUM_BONITO 36
#define CHIP_GFX12 37

#define NO_SQC_STORE (ASIC_FAMILY >= CHIP_SIENNA_CICHLID)
#define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID)
#define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO)
#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO && ASIC_FAMILY < CHIP_GFX12)
#define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
#define SINGLE_STEP_MISSED_WORKAROUND 1 // workaround for lost MODE.DEBUG_EN exception when SAVECTX raised
#if ASIC_FAMILY < CHIP_GFX12
#define S_COHERENCE glc:1
#define V_COHERENCE slc:1 glc:1
#define S_WAITCNT_0 s_waitcnt 0
#else
#define S_COHERENCE scope:SCOPE_SYS
#define V_COHERENCE scope:SCOPE_SYS
#define S_WAITCNT_0 s_wait_idle

#define HW_REG_SHADER_FLAT_SCRATCH_LO HW_REG_WAVE_SCRATCH_BASE_LO
#define HW_REG_SHADER_FLAT_SCRATCH_HI HW_REG_WAVE_SCRATCH_BASE_HI
#define HW_REG_GPR_ALLOC HW_REG_WAVE_GPR_ALLOC
#define HW_REG_LDS_ALLOC HW_REG_WAVE_LDS_ALLOC
#define HW_REG_MODE HW_REG_WAVE_MODE
#endif
#if ASIC_FAMILY < CHIP_GFX12
var SQ_WAVE_STATUS_SPI_PRIO_MASK	= 0x00000006
var SQ_WAVE_STATUS_HALT_MASK		= 0x2000
var SQ_WAVE_STATUS_ECC_ERR_MASK		= 0x20000
var SQ_WAVE_STATUS_TRAP_EN_SHIFT	= 6
var SQ_WAVE_IB_STS2_WAVE64_SHIFT	= 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE		= 1
var SQ_WAVE_LDS_ALLOC_GRANULARITY	= 8
var S_STATUS_HWREG			= HW_REG_STATUS
var S_STATUS_ALWAYS_CLEAR_MASK		= SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK
var S_STATUS_HALT_MASK			= SQ_WAVE_STATUS_HALT_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK		= 0x00FF0000
var S_SAVE_PC_HI_HT_MASK		= 0x01000000
#else
var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK	= 0x4
var SQ_WAVE_STATE_PRIV_SCC_SHIFT		= 9
var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK		= 0xC00
var SQ_WAVE_STATE_PRIV_HALT_MASK		= 0x4000
var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK		= 0x8000
var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT		= 15
var SQ_WAVE_STATUS_WAVE64_SHIFT			= 29
var SQ_WAVE_STATUS_WAVE64_SIZE			= 1
var SQ_WAVE_LDS_ALLOC_GRANULARITY		= 9
var S_STATUS_HWREG				= HW_REG_WAVE_STATE_PRIV
var S_STATUS_ALWAYS_CLEAR_MASK			= SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
var S_STATUS_HALT_MASK				= SQ_WAVE_STATE_PRIV_HALT_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK			= 0xF0000000
#endif
var SQ_WAVE_STATUS_NO_VGPRS_SHIFT		= 24
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT		= 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE		= 8
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT	= 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE	= 4

#if ASIC_FAMILY < CHIP_PLUM_BONITO
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 8
#else
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 12
#endif
#if ASIC_FAMILY < CHIP_GFX12
var SQ_WAVE_TRAPSTS_SAVECTX_MASK	= 0x400
var SQ_WAVE_TRAPSTS_EXCP_MASK		= 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT	= 10
var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK	= 0x80
var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT	= 7
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK	= 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT	= 8
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK	= 0x800
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT	= 11
var SQ_WAVE_TRAPSTS_EXCP_HI_MASK	= 0x7000
#if ASIC_FAMILY >= CHIP_PLUM_BONITO
var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT	= 16
var SQ_WAVE_TRAPSTS_WAVE_START_MASK	= 0x20000
var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT	= 17
var SQ_WAVE_TRAPSTS_WAVE_END_MASK	= 0x40000
var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000
#endif
var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK	= 0x10000000

var SQ_WAVE_MODE_EXCP_EN_SHIFT		= 12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19

var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT	= 15
var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT	= 25
var SQ_WAVE_IB_STS_REPLAY_W64H_MASK	= 0x02000000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000

var SQ_WAVE_MODE_DEBUG_EN_MASK		= 0x800

var S_TRAPSTS_RESTORE_PART_1_SIZE	= SQ_WAVE_TRAPSTS_SAVECTX_SHIFT
var S_TRAPSTS_RESTORE_PART_2_SHIFT	= SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT

#if ASIC_FAMILY < CHIP_PLUM_BONITO
var S_TRAPSTS_NON_MASKABLE_EXCP_MASK	= SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
var S_TRAPSTS_RESTORE_PART_2_SIZE	= 32 - S_TRAPSTS_RESTORE_PART_2_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SHIFT	= 0
var S_TRAPSTS_RESTORE_PART_3_SIZE	= 0
#else
var S_TRAPSTS_NON_MASKABLE_EXCP_MASK	= SQ_WAVE_TRAPSTS_MEM_VIOL_MASK		|\
					  SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK	|\
					  SQ_WAVE_TRAPSTS_WAVE_START_MASK	|\
					  SQ_WAVE_TRAPSTS_WAVE_END_MASK		|\
					  SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK
var S_TRAPSTS_RESTORE_PART_2_SIZE	= SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT - SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SHIFT	= SQ_WAVE_TRAPSTS_WAVE_START_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SIZE	= 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT
#endif
var S_TRAPSTS_HWREG			= HW_REG_TRAPSTS
var S_TRAPSTS_SAVE_CONTEXT_MASK		= SQ_WAVE_TRAPSTS_SAVECTX_MASK
var S_TRAPSTS_SAVE_CONTEXT_SHIFT	= SQ_WAVE_TRAPSTS_SAVECTX_SHIFT
#else
var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK	= 0xF
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK	= 0x10
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT	= 5
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK	= 0x20
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK	= 0x40
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT	= 6
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK	= 0x80
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT	= 7
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK	= 0x100
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT	= 8
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK	= 0x200
var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK	= 0x800
var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK		= 0x80
var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK	= 0x200

var S_TRAPSTS_HWREG			= HW_REG_WAVE_EXCP_FLAG_PRIV
var S_TRAPSTS_SAVE_CONTEXT_MASK		= SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
var S_TRAPSTS_SAVE_CONTEXT_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT
var S_TRAPSTS_NON_MASKABLE_EXCP_MASK	= SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK		|\
					  SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK	|\
					  SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK		|\
					  SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK	|\
					  SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK		|\
					  SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK
var S_TRAPSTS_RESTORE_PART_1_SIZE	= SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT
var S_TRAPSTS_RESTORE_PART_2_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var S_TRAPSTS_RESTORE_PART_2_SIZE	= SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SIZE	= 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT
var BARRIER_STATE_SIGNAL_OFFSET		= 16
var BARRIER_STATE_VALID_OFFSET		= 0
#endif
// bits [31:24] unused by SPI debug data
var TTMP11_SAVE_REPLAY_W64H_SHIFT	= 31
var TTMP11_SAVE_REPLAY_W64H_MASK	= 0x80000000
var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT	= 24
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK	= 0x7F000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT	= 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK	= 0x800000
// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE	= 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC		= 0x10807FAC
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK	= 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT	= 26

var S_SAVE_PC_HI_FIRST_WAVE_MASK	= 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT	= 31

var s_sgpr_save_num			= 108
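// Note: only s0..s107 are saved and restored; s108..s127 are excluded
// (see the 20*4-byte adjustment in the SGPR restore path below). The SGPR
// save area itself stays a fixed get_sgpr_size_bytes() = 512 bytes so the
// save-area layout is uniform regardless.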
var s_save_spi_init_lo		= exec_lo
var s_save_spi_init_hi		= exec_hi
var s_save_pc_lo		= ttmp0
var s_save_pc_hi		= ttmp1
var s_save_exec_lo		= ttmp2
var s_save_exec_hi		= ttmp3
var s_save_status		= ttmp12
var s_save_trapsts		= ttmp15
var s_save_xnack_mask		= s_save_trapsts
var s_wave_size			= ttmp7
var s_save_buf_rsrc0		= ttmp8
var s_save_buf_rsrc1		= ttmp9
var s_save_buf_rsrc2		= ttmp10
var s_save_buf_rsrc3		= ttmp11
var s_save_mem_offset		= ttmp4
var s_save_alloc_size		= s_save_trapsts
var s_save_tmp			= ttmp14
var s_save_m0			= ttmp5
var s_save_ttmps_lo		= s_save_tmp
var s_save_ttmps_hi		= s_save_trapsts
var S_RESTORE_BUF_RSRC_WORD1_STRIDE	= S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC	= S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK	= 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT	= 26
var S_WAVE_SIZE				= 25
var s_restore_spi_init_lo	= exec_lo
var s_restore_spi_init_hi	= exec_hi
var s_restore_mem_offset	= ttmp12
var s_restore_alloc_size	= ttmp3
var s_restore_tmp		= ttmp2
var s_restore_mem_offset_save	= s_restore_tmp
var s_restore_m0		= s_restore_alloc_size
var s_restore_mode		= ttmp7
var s_restore_flat_scratch	= s_restore_tmp
var s_restore_pc_lo		= ttmp0
var s_restore_pc_hi		= ttmp1
var s_restore_exec_lo		= ttmp4
var s_restore_exec_hi		= ttmp5
var s_restore_status		= ttmp14
var s_restore_trapsts		= ttmp15
var s_restore_xnack_mask	= ttmp13
var s_restore_buf_rsrc0		= ttmp8
var s_restore_buf_rsrc1		= ttmp9
var s_restore_buf_rsrc2		= ttmp10
var s_restore_buf_rsrc3		= ttmp11
var s_restore_size		= ttmp6
var s_restore_ttmps_lo		= s_restore_tmp
var s_restore_ttmps_hi		= s_restore_alloc_size
var s_restore_spi_init_hi_save	= s_restore_exec_hi
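// Entry points: assuming the usual CWSR entry convention, SPI starts the
// wave at 'main' for a trap/save event and one instruction past it
// (L_JUMP_TO_RESTORE) when dispatching a context restore, so the first two
// instructions must remain a fixed-size branch pair.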
shader main
        asic(DEFAULT)
        type(CS)
        wave_size(32)

        s_branch        L_SKIP_RESTORE  //NOT restore. might be a regular trap or save

L_JUMP_TO_RESTORE:
        s_branch        L_RESTORE

L_SKIP_RESTORE:
        s_getreg_b32    s_save_status, hwreg(S_STATUS_HWREG)    //save STATUS since we will change SCC
        // Clear SPI_PRIO: do not save with elevated priority.
        // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
        s_andn2_b32     s_save_status, s_save_status, S_STATUS_ALWAYS_CLEAR_MASK

        s_getreg_b32    s_save_trapsts, hwreg(S_TRAPSTS_HWREG)

#if SW_SA_TRAP
        // If ttmp1[30] is set then issue s_barrier to unblock dependent waves.
        s_bitcmp1_b32   s_save_pc_hi, 30
        s_cbranch_scc0  L_TRAP_NO_BARRIER
        s_barrier

L_TRAP_NO_BARRIER:
        // If ttmp1[31] is set then trap may occur early.
        // Spin wait until SAVECTX exception is raised.
        s_bitcmp1_b32   s_save_pc_hi, 31
        s_cbranch_scc1  L_CHECK_SAVE
#endif
        s_and_b32       ttmp2, s_save_status, S_STATUS_HALT_MASK
        s_cbranch_scc0  L_NOT_HALTED

L_HALTED:
        // Host trap may occur while wave is halted.
#if ASIC_FAMILY < CHIP_GFX12
        s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
#else
        s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
#endif
        s_cbranch_scc1  L_FETCH_2ND_TRAP

L_CHECK_SAVE:
        s_and_b32       ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK
        s_cbranch_scc1  L_SAVE

        // Wave is halted but neither host trap nor SAVECTX is raised.
        // Caused by instruction fetch memory violation.
        // Spin wait until context saved to prevent interrupt storm.
        s_sleep         0x10
        s_getreg_b32    s_save_trapsts, hwreg(S_TRAPSTS_HWREG)
        s_branch        L_CHECK_SAVE
L_NOT_HALTED:
        // Let second-level handle non-SAVECTX exception or trap.
        // Any concurrent SAVECTX will be handled upon re-entry once halted.

        // Check non-maskable exceptions. memory_violation, illegal_instruction
        // and xnack_error exceptions always cause the wave to enter the trap
        // handler.
        s_and_b32       ttmp2, s_save_trapsts, S_TRAPSTS_NON_MASKABLE_EXCP_MASK
        s_cbranch_scc1  L_FETCH_2ND_TRAP

        // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
        // Maskable exceptions only cause the wave to enter the trap handler if
        // their respective bit in mode.excp_en is set.
#if ASIC_FAMILY < CHIP_GFX12
        s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
        s_cbranch_scc0  L_CHECK_TRAP_ID

        s_and_b32       ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
        s_cbranch_scc0  L_NOT_ADDR_WATCH
        s_bitset1_b32   ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT        // Check all addr_watch[123] exceptions against excp_en.addr_watch

L_NOT_ADDR_WATCH:
        s_getreg_b32    ttmp3, hwreg(HW_REG_MODE)
        s_lshl_b32      ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
        s_and_b32       ttmp2, ttmp2, ttmp3
        s_cbranch_scc1  L_FETCH_2ND_TRAP
#else
        s_getreg_b32    ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
        s_and_b32       ttmp3, s_save_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
        s_cbranch_scc0  L_NOT_ADDR_WATCH
        s_or_b32        ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK

L_NOT_ADDR_WATCH:
        s_getreg_b32    ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL)
        s_and_b32       ttmp2, ttmp3, ttmp2
        s_cbranch_scc1  L_FETCH_2ND_TRAP
#endif

L_CHECK_TRAP_ID:
        // Check trap_id != 0
        s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
        s_cbranch_scc1  L_FETCH_2ND_TRAP
#if SINGLE_STEP_MISSED_WORKAROUND
        // Prioritize single step exception over context save.
        // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
#if ASIC_FAMILY < CHIP_GFX12
        s_getreg_b32    ttmp2, hwreg(HW_REG_MODE)
        s_and_b32       ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
#else
        // WAVE_TRAP_CTRL is already in ttmp3.
        s_and_b32       ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK
#endif
        s_cbranch_scc1  L_FETCH_2ND_TRAP
#endif

        s_and_b32       ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK
        s_cbranch_scc1  L_SAVE
L_FETCH_2ND_TRAP:
#if HAVE_XNACK
        save_and_clear_ib_sts(ttmp14, ttmp15)
#endif
        // Read second-level TBA/TMA from first-level TMA and jump if available.
        // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
        // ttmp12 holds SQ_WAVE_STATUS
#if HAVE_SENDMSG_RTN
        s_sendmsg_rtn_b64       [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
        S_WAITCNT_0
#else
        s_getreg_b32    ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
        s_getreg_b32    ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
#endif
        s_lshl_b64      [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
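        // The TMA hwreg holds the 48-bit trap memory address pre-shifted
        // right by 8, hence the shift-left above; bit 15 of ttmp15 is then
        // address bit 47, which the check below propagates into the upper
        // 16 bits to form a canonical 64-bit address.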
        s_bitcmp1_b32   ttmp15, 0xF
        s_cbranch_scc0  L_NO_SIGN_EXTEND_TMA
        s_or_b32        ttmp15, ttmp15, 0xFFFF0000
L_NO_SIGN_EXTEND_TMA:

        s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE       // debug trap enabled flag
        S_WAITCNT_0
        s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
        s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
        s_or_b32        ttmp11, ttmp11, ttmp2

        s_load_dwordx2  [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE       // second-level TBA
        S_WAITCNT_0
        s_load_dwordx2  [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE     // second-level TMA
        S_WAITCNT_0

        s_and_b64       [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
        s_cbranch_scc0  L_NO_NEXT_TRAP  // second-level trap handler has not been set
        s_setpc_b64     [ttmp2, ttmp3]  // jump to second-level trap handler
L_NO_NEXT_TRAP:
        // If not caused by trap then halt wave to prevent re-entry.
        s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
        s_cbranch_scc1  L_TRAP_CASE

        // Host trap will not cause trap re-entry.
#if ASIC_FAMILY < CHIP_GFX12
        s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK
#else
        s_getreg_b32    ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
        s_and_b32       ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
#endif
        s_cbranch_scc1  L_EXIT_TRAP
        s_or_b32        s_save_status, s_save_status, S_STATUS_HALT_MASK

        // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
        // Rewind the PC to prevent this from occurring.
        s_sub_u32       ttmp0, ttmp0, 0x8
        s_subb_u32      ttmp1, ttmp1, 0x0

        s_branch        L_EXIT_TRAP

L_TRAP_CASE:
        // Advance past trap instruction to prevent re-entry.
        s_add_u32       ttmp0, ttmp0, 0x4
        s_addc_u32      ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
        s_and_b32       ttmp1, ttmp1, 0xFFFF
#if HAVE_XNACK
        restore_ib_sts(ttmp14, ttmp15)
#endif
        // Restore SQ_WAVE_STATUS.
        s_and_b64       exec, exec, exec        // Restore STATUS.EXECZ, not writable by s_setreg_b32
        s_and_b64       vcc, vcc, vcc           // Restore STATUS.VCCZ, not writable by s_setreg_b32

#if ASIC_FAMILY < CHIP_GFX12
        s_setreg_b32    hwreg(S_STATUS_HWREG), s_save_status
#else
        // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
        // Only restore fields which the trap handler changes.
        s_lshr_b32      s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_SCC_SHIFT
        s_setreg_b32    hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
                SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status
#endif

        s_rfe_b64       [ttmp0, ttmp1]
L_SAVE:
        // If VGPRs have been deallocated then terminate the wavefront.
        // It has no remaining program to run and cannot save without VGPRs.
#if ASIC_FAMILY == CHIP_PLUM_BONITO
        s_bitcmp1_b32   s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
        s_cbranch_scc0  L_HAVE_VGPRS
        s_endpgm
#endif
#if ASIC_FAMILY >= CHIP_GFX12
        s_getreg_b32    s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
        s_bitcmp1_b32   s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
        s_cbranch_scc0  L_HAVE_VGPRS
        s_endpgm
#endif
L_HAVE_VGPRS:

        s_and_b32       s_save_pc_hi, s_save_pc_hi, 0x0000ffff  //pc[47:32]
        s_mov_b32       s_save_tmp, 0
        s_setreg_b32    hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 1), s_save_tmp     //clear saveCtx bit

#if HAVE_XNACK
        save_and_clear_ib_sts(s_save_tmp, s_save_trapsts)
#endif
        /* inform SPI the readiness and wait for SPI's go signal */
        s_mov_b32       s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
        s_mov_b32       s_save_exec_hi, exec_hi
        s_mov_b64       exec, 0x0               //clear EXEC to get ready to receive

#if HAVE_SENDMSG_RTN
        s_sendmsg_rtn_b64       [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
#else
        s_sendmsg       sendmsg(MSG_SAVEWAVE)   //send SPI a message and wait for SPI's write to EXEC
#endif

#if ASIC_FAMILY < CHIP_SIENNA_CICHLID
L_SLEEP:
        // sleep 1 (64clk) is not enough for 8 waves per SIMD, which would cause
        // an SQ hang: the 7th/8th waves cannot get arbitration to execute
        // instructions while the other waves sit in this sleep loop waiting for
        // wrexec != 0.
        s_sleep         0x2
        s_cbranch_execz L_SLEEP
#else
        S_WAITCNT_0
#endif
        // Save first_wave flag so we can clear high bits of save address.
        s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
        s_lshl_b32      s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
        s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
#if NO_SQC_STORE
#if ASIC_FAMILY <= CHIP_SIENNA_CICHLID
        // gfx10: If there was a VALU exception, the exception state must be
        // cleared before executing the VALU instructions below.
        v_clrexcp
#endif

        // Trap temporaries must be saved via VGPR but all VGPRs are in use.
        // There is no ttmp space to hold the resource constant for VGPR save.
        // Save v0 by itself since it requires only two SGPRs.
        s_mov_b32       s_save_ttmps_lo, exec_lo
        s_and_b32       s_save_ttmps_hi, exec_hi, 0xFFFF
        s_mov_b32       exec_lo, 0xFFFFFFFF
        s_mov_b32       exec_hi, 0xFFFFFFFF
        global_store_dword_addtid      v0, [s_save_ttmps_lo, s_save_ttmps_hi] V_COHERENCE
        v_mov_b32       v0, 0x0
        s_mov_b32       exec_lo, s_save_ttmps_lo
        s_mov_b32       exec_hi, s_save_ttmps_hi
#endif
        // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
        // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
        get_wave_size2(s_save_ttmps_hi)
        get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
        get_svgpr_size_bytes(s_save_ttmps_hi)
        s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
        s_and_b32       s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
        s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
        s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
        s_addc_u32      s_save_ttmps_hi, s_save_ttmps_hi, 0x0

#if NO_SQC_STORE
        v_writelane_b32 v0, ttmp4, 0x4
        v_writelane_b32 v0, ttmp5, 0x5
        v_writelane_b32 v0, ttmp6, 0x6
        v_writelane_b32 v0, ttmp7, 0x7
        v_writelane_b32 v0, ttmp8, 0x8
        v_writelane_b32 v0, ttmp9, 0x9
        v_writelane_b32 v0, ttmp10, 0xA
        v_writelane_b32 v0, ttmp11, 0xB
        v_writelane_b32 v0, ttmp13, 0xD
        v_writelane_b32 v0, exec_lo, 0xE
        v_writelane_b32 v0, exec_hi, 0xF

        s_mov_b32       exec_lo, 0x3FFF
        s_mov_b32       exec_hi, 0x0
        global_store_dword_addtid      v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 V_COHERENCE
        v_readlane_b32  ttmp14, v0, 0xE
        v_readlane_b32  ttmp15, v0, 0xF
        s_mov_b32       exec_lo, ttmp14
        s_mov_b32       exec_hi, ttmp15
#else
        s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 S_COHERENCE
        s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 S_COHERENCE
        s_store_dword   ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 S_COHERENCE
#endif
        /* setup Resource Constants */
        s_mov_b32       s_save_buf_rsrc0, s_save_spi_init_lo    //base_addr_lo
        s_and_b32       s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF        //base_addr_hi
        s_or_b32        s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
        s_mov_b32       s_save_buf_rsrc2, 0     //NUM_RECORDS initial value = 0 (in bytes), although not necessarily inited
        s_mov_b32       s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC

        s_mov_b32       s_save_m0, m0

        /* global mem offset */
        s_mov_b32       s_save_mem_offset, 0x0
        get_wave_size2(s_wave_size)
#if HAVE_XNACK
        // Save and clear vector XNACK state late to free up SGPRs.
        s_getreg_b32    s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
        s_setreg_imm32_b32      hwreg(HW_REG_SHADER_XNACK_MASK), 0x0
#endif
        /* save first 4 VGPRs, needed for SGPR save */
        s_mov_b32       exec_lo, 0xFFFFFFFF     //need every thread from now on
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_SAVE_4VGPR_EXEC_HI
        s_mov_b32       exec_hi, 0x00000000
        s_branch        L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
        s_mov_b32       exec_hi, 0xFFFFFFFF
        s_branch        L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
        s_mov_b32       s_save_buf_rsrc2, 0x1000000     //NUM_RECORDS in bytes

        // VGPR Allocated in 4-GPR granularity
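        // Two store paths follow: the fast one writes VGPRs through the TCP
        // (vector memory) with buffer_store_dword, while the SQC fallback is
        // taken when a prior XNACK error would make TCP stores fail (see the
        // SAVE_AFTER_XNACK_ERROR workaround above) and instead moves lanes
        // out one at a time through scalar stores.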
#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1  L_SAVE_FIRST_VGPRS32_WITH_TCP

        write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
        s_branch        L_SAVE_HWREG

L_SAVE_FIRST_VGPRS32_WITH_TCP:
#endif

#if !NO_SQC_STORE
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#endif
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3
        s_branch        L_SAVE_HWREG
L_SAVE_4VGPR_WAVE64:
        s_mov_b32       s_save_buf_rsrc2, 0x1000000     //NUM_RECORDS in bytes

        // VGPR Allocated in 4-GPR granularity

#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1  L_SAVE_FIRST_VGPRS64_WITH_TCP

        write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
        s_branch        L_SAVE_HWREG

L_SAVE_FIRST_VGPRS64_WITH_TCP:
#endif

#if !NO_SQC_STORE
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#endif
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3
        /* save HW registers */

L_SAVE_HWREG:
        // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
        get_svgpr_size_bytes(s_save_tmp)
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
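        // Resulting save-area layout from the wave's save base (illustrative;
        // the VGPR/SVGPR sizes depend on the wave's allocation):
        //   +0                          : VGPRs
        //   +size(VGPR)                 : shared VGPRs (wave64 only)
        //   +size(VGPR)+size(SVGPR)     : SGPRs, get_sgpr_size_bytes() = 512 bytes
        //   +...+get_sgpr_size_bytes()  : HWREG block, 128 bytes total; HWREGs
        //                                 fill the first 0x40 bytes and the ttmps
        //                                 follow at +0x40 (hence the 0x50/0x60/0x74
        //                                 stores earlier)
        //   +...+get_hwreg_size_bytes() : LDS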
        s_mov_b32       s_save_buf_rsrc2, 0x1000000     //NUM_RECORDS in bytes

#if NO_SQC_STORE
        v_mov_b32       v0, 0x0 //Offset[31:0] from buffer resource
        v_mov_b32       v1, 0x0 //Offset[63:32] from buffer resource
        v_mov_b32       v2, 0x0 //Set of SGPRs for TCP store
        s_mov_b32       m0, 0x0 //Next lane of v2 to write to
#endif
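        // With NO_SQC_STORE the scalar values cannot be written through the
        // SQC, so write_hwreg_to_mem/write_*sgpr_to_mem below stage them one
        // per lane into v2 via v_writelane; the staged lanes are then flushed
        // with a single buffer_store_dword once enough have accumulated.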
#if ASIC_FAMILY >= CHIP_GFX12
        // Ensure no further changes to barrier or LDS state.
        // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
        s_barrier_signal        -2
        s_barrier_wait  -2

        // Re-read final state of BARRIER_COMPLETE field for save.
        s_getreg_b32    s_save_tmp, hwreg(S_STATUS_HWREG)
        s_and_b32       s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
        s_andn2_b32     s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
        s_or_b32        s_save_status, s_save_status, s_save_tmp
#endif
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
        write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
        s_andn2_b32     s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
        write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
        write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
        write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
        write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_tmp, hwreg(S_TRAPSTS_HWREG)
        write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)

        // Not used on Sienna_Cichlid but keep layout same for debugger.
        write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

#if ASIC_FAMILY >= CHIP_GFX12
        s_getreg_b32    s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
        write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
        s_get_barrier_state s_save_tmp, -1
        s_wait_kmcnt (0)
        write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
#endif
#if NO_SQC_STORE
        // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
        s_mov_b32       exec_lo, 0xFFFF
        s_mov_b32       exec_hi, 0x0
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

        // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
        s_mov_b32       exec_lo, 0xFFFFFFFF
#endif
        /* save SGPRs */
        // Save SGPRs before LDS save, then s0 to s4 can be used during LDS save...

        // SGPR SR memory offset : size(VGPR)+size(SVGPR)
        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
        get_svgpr_size_bytes(s_save_tmp)
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
        s_mov_b32       s_save_buf_rsrc2, 0x1000000     //NUM_RECORDS in bytes

#if NO_SQC_STORE
        s_mov_b32       ttmp13, 0x0     //next VGPR lane to copy SGPR into
#else
        // back up s_save_buf_rsrc0 into s_save_xnack_mask, since the
        // write_16sgpr_to_mem function below advances rsrc0
        s_mov_b32       s_save_xnack_mask, s_save_buf_rsrc0
        s_add_u32       s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
        s_addc_u32      s_save_buf_rsrc1, s_save_buf_rsrc1, 0
#endif
        s_mov_b32       m0, 0x0 //SGPR initial index value = 0
        s_nop           0x0     //Manually inserted wait states
L_SAVE_SGPR_LOOP:
        // SGPR is allocated in 16-SGPR granularity
        s_movrels_b64   s0, s0          //s0 = s[0+m0], s1 = s[1+m0]
        s_movrels_b64   s2, s2          //s2 = s[2+m0], s3 = s[3+m0]
        s_movrels_b64   s4, s4          //s4 = s[4+m0], s5 = s[5+m0]
        s_movrels_b64   s6, s6          //s6 = s[6+m0], s7 = s[7+m0]
        s_movrels_b64   s8, s8          //s8 = s[8+m0], s9 = s[9+m0]
        s_movrels_b64   s10, s10        //s10 = s[10+m0], s11 = s[11+m0]
        s_movrels_b64   s12, s12        //s12 = s[12+m0], s13 = s[13+m0]
        s_movrels_b64   s14, s14        //s14 = s[14+m0], s15 = s[15+m0]
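        // s_movrels reads its source relative to m0, so the block above
        // copies s[m0..m0+15] down into s0..s15 where write_16sgpr_to_mem
        // can reach them with fixed register operands; m0 then advances by
        // 16 per pass.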
        write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
        s_cmp_eq_u32    ttmp13, 0x20    //have 32 VGPR lanes filled?
        s_cbranch_scc0  L_SAVE_SGPR_SKIP_TCP_STORE

        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
        s_add_u32       s_save_mem_offset, s_save_mem_offset, 0x80
        s_mov_b32       ttmp13, 0x0

L_SAVE_SGPR_SKIP_TCP_STORE:
#endif

        s_add_u32       m0, m0, 16      //next sgpr index
        s_cmp_lt_u32    m0, 96  //scc = (m0 < first 96 SGPRs) ? 1 : 0
        s_cbranch_scc1  L_SAVE_SGPR_LOOP        //first 96 SGPR save is complete?
        //save the remaining 12 SGPRs
        s_movrels_b64   s0, s0          //s0 = s[0+m0], s1 = s[1+m0]
        s_movrels_b64   s2, s2          //s2 = s[2+m0], s3 = s[3+m0]
        s_movrels_b64   s4, s4          //s4 = s[4+m0], s5 = s[5+m0]
        s_movrels_b64   s6, s6          //s6 = s[6+m0], s7 = s[7+m0]
        s_movrels_b64   s8, s8          //s8 = s[8+m0], s9 = s[9+m0]
        s_movrels_b64   s10, s10        //s10 = s[10+m0], s11 = s[11+m0]
        write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#else
        // restore s_save_buf_rsrc0,1
        s_mov_b32       s_save_buf_rsrc0, s_save_xnack_mask
#endif
        /* save LDS */

L_SAVE_LDS:
        // Change EXEC to all threads...
        s_mov_b32       exec_lo, 0xFFFFFFFF     //need every thread from now on
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_SAVE_LDS_EXEC_HI
        s_mov_b32       exec_hi, 0x00000000
        s_branch        L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
        s_mov_b32       exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
        s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
        s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF        //lds_size is zero?
        s_cbranch_scc0  L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE
#if ASIC_FAMILY < CHIP_GFX12
        s_barrier       //LDS is used? wait for other waves in the same TG
#endif
        s_and_b32       s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
        s_cbranch_scc0  L_SAVE_LDS_DONE
        // first wave does the LDS save;

        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
        s_mov_b32       s_save_buf_rsrc2, s_save_alloc_size     //NUM_RECORDS in bytes

        // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)

        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
        get_svgpr_size_bytes(s_save_tmp)
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

        s_mov_b32       s_save_buf_rsrc2, 0x1000000     //NUM_RECORDS in bytes
        // compute each lane's LDS byte address (lane_id * 4, i.e. 0~63*4) into v0
        v_mbcnt_lo_u32_b32      v0, -1, 0
        v_mbcnt_hi_u32_b32      v0, -1, v0
        v_mul_u32_u24   v0, 4, v0
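        // v_mbcnt_{lo,hi} count the set bits of the mask operand below each
        // lane; with a -1 mask that is the lane's own index, so lane 5 ends
        // up with v0 = 5*4 = 20, giving per-lane dword addresses.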
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_SAVE_LDS_W64

L_SAVE_LDS_W32:
#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1  L_SAVE_LDS_WITH_TCP_W32

L_SAVE_LDS_LOOP_SQC_W32:
        ds_read_b32     v1, v0
        S_WAITCNT_0

        write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)

        s_add_u32       m0, m0, 128     //every pass saves 128 bytes (32 lanes x 4)
        v_add_nc_u32    v0, v0, 128     //mem offset increased by 128 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size   //scc=(m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_LDS_LOOP_SQC_W32 //LDS save is complete?

        s_branch        L_SAVE_LDS_DONE
L_SAVE_LDS_WITH_TCP_W32:
#endif

        s_mov_b32       s3, 128
        s_nop           0
        s_nop           0
        s_nop           0
L_SAVE_LDS_LOOP_W32:
        ds_read_b32     v1, v0
        S_WAITCNT_0
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

        s_add_u32       m0, m0, s3      //every buffer_store_dword does 128 bytes
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s3
        v_add_nc_u32    v0, v0, 128     //mem offset increased by 128 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size   //scc=(m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_LDS_LOOP_W32     //LDS save is complete?

        s_branch        L_SAVE_LDS_DONE
L_SAVE_LDS_W64:
#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1  L_SAVE_LDS_WITH_TCP_W64

L_SAVE_LDS_LOOP_SQC_W64:
        ds_read_b32     v1, v0
        S_WAITCNT_0

        write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)

        s_add_u32       m0, m0, 256     //every pass saves 256 bytes (64 lanes x 4)
        v_add_nc_u32    v0, v0, 256     //mem offset increased by 256 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size   //scc=(m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_LDS_LOOP_SQC_W64 //LDS save is complete?

        s_branch        L_SAVE_LDS_DONE
L_SAVE_LDS_WITH_TCP_W64:
#endif

        s_mov_b32       s3, 256
        s_nop           0
        s_nop           0
        s_nop           0
L_SAVE_LDS_LOOP_W64:
        ds_read_b32     v1, v0
        S_WAITCNT_0
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

        s_add_u32       m0, m0, s3      //every buffer_store_dword does 256 bytes
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s3
        v_add_nc_u32    v0, v0, 256     //mem offset increased by 256 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size   //scc=(m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_LDS_LOOP_W64     //LDS save is complete?
L_SAVE_LDS_DONE:
        /* save VGPRs - set the rest of the VGPRs */

L_SAVE_VGPR:
        // VGPR SR memory offset: 0
        s_mov_b32       exec_lo, 0xFFFFFFFF     //need every thread from now on
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_SAVE_VGPR_EXEC_HI
        s_mov_b32       s_save_mem_offset, (0+128*4)    // for the rest of the VGPRs
        s_mov_b32       exec_hi, 0x00000000
        s_branch        L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
        s_mov_b32       s_save_mem_offset, (0+256*4)    // for the rest of the VGPRs
        s_mov_b32       exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
        s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
        s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
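        // Worked example: a GPR_ALLOC.VGPR_SIZE field of 7 means
        // (7+1)*4 = 32 allocated VGPRs, so the loops below run from m0=4
        // (v0..v3 were already saved) up to that count in steps of 4.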
        //determine whether it is wave32 or wave64
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_SAVE_VGPR_WAVE64
        s_mov_b32       s_save_buf_rsrc2, 0x1000000     //NUM_RECORDS in bytes

        // VGPR Allocated in 4-GPR granularity

        // VGPR store using dword burst
        s_mov_b32       m0, 0x4 //VGPR initial index value = 4
        s_cmp_lt_u32    m0, s_save_alloc_size
        s_cbranch_scc0  L_SAVE_VGPR_END

#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1  L_SAVE_VGPR_W32_LOOP

L_SAVE_VGPR_LOOP_SQC_W32:
        v_movrels_b32   v0, v0  //v0 = v[0+m0]
        v_movrels_b32   v1, v1  //v1 = v[1+m0]
        v_movrels_b32   v2, v2  //v2 = v[2+m0]
        v_movrels_b32   v3, v3  //v3 = v[3+m0]

        write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)

        s_add_u32       m0, m0, 4
        s_cmp_lt_u32    m0, s_save_alloc_size
        s_cbranch_scc1  L_SAVE_VGPR_LOOP_SQC_W32

        s_branch        L_SAVE_VGPR_END
#endif
L_SAVE_VGPR_W32_LOOP:
        v_movrels_b32   v0, v0  //v0 = v[0+m0]
        v_movrels_b32   v1, v1  //v1 = v[1+m0]
        v_movrels_b32   v2, v2  //v2 = v[2+m0]
        v_movrels_b32   v3, v3  //v3 = v[3+m0]

        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3

        s_add_u32       m0, m0, 4       //next vgpr index
        s_add_u32       s_save_mem_offset, s_save_mem_offset, 128*4     //every buffer_store_dword does 128 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size   //scc = (m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_VGPR_W32_LOOP    //VGPR save is complete?

        s_branch        L_SAVE_VGPR_END
L_SAVE_VGPR_WAVE64:
        s_mov_b32       s_save_buf_rsrc2, 0x1000000     //NUM_RECORDS in bytes

        // VGPR store using dword burst
        s_mov_b32       m0, 0x4 //VGPR initial index value = 4
        s_cmp_lt_u32    m0, s_save_alloc_size
        s_cbranch_scc0  L_SAVE_SHARED_VGPR

#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1  L_SAVE_VGPR_W64_LOOP

L_SAVE_VGPR_LOOP_SQC_W64:
        v_movrels_b32   v0, v0  //v0 = v[0+m0]
        v_movrels_b32   v1, v1  //v1 = v[1+m0]
        v_movrels_b32   v2, v2  //v2 = v[2+m0]
        v_movrels_b32   v3, v3  //v3 = v[3+m0]

        write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)

        s_add_u32       m0, m0, 4
        s_cmp_lt_u32    m0, s_save_alloc_size
        s_cbranch_scc1  L_SAVE_VGPR_LOOP_SQC_W64

        s_branch        L_SAVE_VGPR_END
#endif
L_SAVE_VGPR_W64_LOOP:
        v_movrels_b32   v0, v0  //v0 = v[0+m0]
        v_movrels_b32   v1, v1  //v1 = v[1+m0]
        v_movrels_b32   v2, v2  //v2 = v[2+m0]
        v_movrels_b32   v3, v3  //v3 = v[3+m0]

        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3

        s_add_u32       m0, m0, 4       //next vgpr index
        s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4     //every buffer_store_dword does 256 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size   //scc = (m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_VGPR_W64_LOOP    //VGPR save is complete?
L_SAVE_SHARED_VGPR:
        //Below is the save of the shared VGPRs (new for gfx10)
        s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
        s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF        //shared_vgpr_size is zero?
        s_cbranch_scc0  L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_VGPR_END
        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
        //m0 now has the normal VGPR count; add the shared_vgpr count to it to get the total count.
        //shared_vgpr save starts from the index in m0
        s_add_u32       s_save_alloc_size, s_save_alloc_size, m0
        s_mov_b32       exec_lo, 0xFFFFFFFF
        s_mov_b32       exec_hi, 0x00000000
#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1  L_SAVE_SHARED_VGPR_WAVE64_LOOP

L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC:
        v_movrels_b32   v0, v0

        write_vgprs_to_mem_with_sqc_w64(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)

        s_add_u32       m0, m0, 1
        s_cmp_lt_u32    m0, s_save_alloc_size
        s_cbranch_scc1  L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC

        s_branch        L_SAVE_VGPR_END
#endif
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
        v_movrels_b32   v0, v0  //v0 = v[0+m0]
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
        s_add_u32       m0, m0, 1       //next vgpr index
        s_add_u32       s_save_mem_offset, s_save_mem_offset, 128
        s_cmp_lt_u32    m0, s_save_alloc_size   //scc = (m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_SHARED_VGPR_WAVE64_LOOP  //SHARED_VGPR save is complete?

L_SAVE_VGPR_END:
        s_branch        L_END_PGM
L_RESTORE:
        /* Setup Resource Constants */
        s_mov_b32       s_restore_buf_rsrc0, s_restore_spi_init_lo      //base_addr_lo
        s_and_b32       s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF  //base_addr_hi
        s_or_b32        s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
        s_mov_b32       s_restore_buf_rsrc2, 0  //NUM_RECORDS initial value = 0 (in bytes)
        s_mov_b32       s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
#if ASIC_FAMILY >= CHIP_GFX12
        // Save s_restore_spi_init_hi for later use.
        s_mov_b32       s_restore_spi_init_hi_save, s_restore_spi_init_hi
#endif

        //determine whether it is wave32 or wave64
        get_wave_size2(s_restore_size)

        s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
        s_cbranch_scc0  L_RESTORE_VGPR
        /* restore LDS */
L_RESTORE_LDS:
        s_mov_b32       exec_lo, 0xFFFFFFFF     //need every thread from now on
        s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_RESTORE_LDS_EXEC_HI
        s_mov_b32       exec_hi, 0x00000000
        s_branch        L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
        s_mov_b32       exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
        s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
        s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF  //lds_size is zero?
        s_cbranch_scc0  L_RESTORE_VGPR  //no lds used? jump to L_RESTORE_VGPR
        s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
        s_mov_b32       s_restore_buf_rsrc2, s_restore_alloc_size       //NUM_RECORDS in bytes
        // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)

        get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
        get_svgpr_size_bytes(s_restore_tmp)
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

        s_mov_b32       s_restore_buf_rsrc2, 0x1000000  //NUM_RECORDS in bytes

        s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64
L_RESTORE_LDS_LOOP_W32:
#if HAVE_BUFFER_LDS_LOAD
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
#else
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
        S_WAITCNT_0
        ds_store_addtid_b32     v0
#endif
        s_add_u32       m0, m0, 128     //128 bytes per pass (32 lanes x 4)
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128 bytes
        s_cmp_lt_u32    m0, s_restore_alloc_size        //scc=(m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_LDS_LOOP_W32  //LDS restore is complete?
        s_branch        L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
#if HAVE_BUFFER_LDS_LOAD
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
#else
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
        S_WAITCNT_0
        ds_store_addtid_b32     v0
#endif
        s_add_u32       m0, m0, 256     //256 bytes per pass (64 lanes x 4)
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
        s_cmp_lt_u32    m0, s_restore_alloc_size        //scc=(m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64  //LDS restore is complete?
        /* restore VGPRs */
L_RESTORE_VGPR:
        // VGPR SR memory offset : 0
        s_mov_b32       s_restore_mem_offset, 0x0
        s_mov_b32       exec_lo, 0xFFFFFFFF     //need every thread from now on
        s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_RESTORE_VGPR_EXEC_HI
        s_mov_b32       exec_hi, 0x00000000
        s_branch        L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
        s_mov_b32       exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
        s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
        s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
        s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2  //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
        //determine whether it is wave32 or wave64
        s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_RESTORE_VGPR_WAVE64
        s_mov_b32       s_restore_buf_rsrc2, 0x1000000  //NUM_RECORDS in bytes

        // VGPR load using dword burst
        s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset // restore starts with v4, v0 will be the last
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128*4
        s_mov_b32       m0, 4   //VGPR initial index value = 4
        s_cmp_lt_u32    m0, s_restore_alloc_size
        s_cbranch_scc0  L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE32_LOOP:
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
        buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128
        buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2
        buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3
        S_WAITCNT_0
        v_movreld_b32   v0, v0  //v[0+m0] = v0
        v_movreld_b32   v1, v1
        v_movreld_b32   v2, v2
        v_movreld_b32   v3, v3
        s_add_u32       m0, m0, 4       //next vgpr index
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128*4       //every buffer_load_dword does 128 bytes
        s_cmp_lt_u32    m0, s_restore_alloc_size        //scc = (m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_VGPR_WAVE32_LOOP      //VGPR restore (except v0) is complete?

        /* VGPR restore on v0 */
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE
        buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128
        buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2
        buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3
        S_WAITCNT_0

        s_branch        L_RESTORE_SGPR
L_RESTORE_VGPR_WAVE64:
        s_mov_b32       s_restore_buf_rsrc2, 0x1000000  //NUM_RECORDS in bytes

        // VGPR load using dword burst
        s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset // restore starts with v4, v0 will be the last
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
        s_mov_b32       m0, 4   //VGPR initial index value = 4
        s_cmp_lt_u32    m0, s_restore_alloc_size
        s_cbranch_scc0  L_RESTORE_SHARED_VGPR

L_RESTORE_VGPR_WAVE64_LOOP:
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
        buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256
        buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2
        buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3
        S_WAITCNT_0
        v_movreld_b32   v0, v0  //v[0+m0] = v0
        v_movreld_b32   v1, v1
        v_movreld_b32   v2, v2
        v_movreld_b32   v3, v3
        s_add_u32       m0, m0, 4       //next vgpr index
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4       //every buffer_load_dword does 256 bytes
        s_cmp_lt_u32    m0, s_restore_alloc_size        //scc = (m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_VGPR_WAVE64_LOOP      //VGPR restore (except v0) is complete?
L_RESTORE_SHARED_VGPR:
        //Below is the restore of the shared VGPRs (new for gfx10)
        s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)  //shared_vgpr_size
        s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF  //shared_vgpr_size is zero?
        s_cbranch_scc0  L_RESTORE_V0    //no shared_vgpr used?
        s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 3  //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
        //m0 now has the normal VGPR count; add the shared_vgpr count to it to get the total count.
        //shared_vgpr restore starts from the index in m0
        s_add_u32       s_restore_alloc_size, s_restore_alloc_size, m0
        s_mov_b32       exec_lo, 0xFFFFFFFF
        s_mov_b32       exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
        S_WAITCNT_0
        v_movreld_b32   v0, v0  //v[0+m0] = v0
        s_add_u32       m0, m0, 1       //next vgpr index
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128
        s_cmp_lt_u32    m0, s_restore_alloc_size        //scc = (m0 < s_restore_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_RESTORE_SHARED_VGPR_WAVE64_LOOP       //SHARED_VGPR restore is complete?

        s_mov_b32       exec_hi, 0xFFFFFFFF     //restore exec_hi before restoring v0!
        /* VGPR restore on v0 */
L_RESTORE_V0:
        buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE
        buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256
        buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2
        buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3
        S_WAITCNT_0
        /* restore SGPRs */
        // read order: 4 + 8 + 6*16 SGPRs = 108 in total
        // SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
        get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
        get_svgpr_size_bytes(s_restore_tmp)
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
        s_sub_u32       s_restore_mem_offset, s_restore_mem_offset, 20*4        //s108~s127 are not saved

        s_mov_b32       s_restore_buf_rsrc2, 0x1000000  //NUM_RECORDS in bytes

        s_mov_b32       m0, s_sgpr_save_num
        read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
        S_WAITCNT_0

        s_sub_u32       m0, m0, 4       // Restore from S[0] to S[104]
        s_nop           0       // hazard SALU M0 => S_MOVREL

        s_movreld_b64   s0, s0  //s[0+m0] = s0
        s_movreld_b64   s2, s2

        read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
        S_WAITCNT_0

        s_sub_u32       m0, m0, 8       // Restore from S[0] to S[96]
        s_nop           0       // hazard SALU M0 => S_MOVREL

        s_movreld_b64   s0, s0  //s[0+m0] = s0
        s_movreld_b64   s2, s2
        s_movreld_b64   s4, s4
        s_movreld_b64   s6, s6

L_RESTORE_SGPR_LOOP:
        read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
        S_WAITCNT_0

        s_sub_u32       m0, m0, 16      // Restore from S[n] to S[0]
        s_nop           0       // hazard SALU M0 => S_MOVREL

        s_movreld_b64   s0, s0  //s[0+m0] = s0
        s_movreld_b64   s2, s2
        s_movreld_b64   s4, s4
        s_movreld_b64   s6, s6
        s_movreld_b64   s8, s8
        s_movreld_b64   s10, s10
        s_movreld_b64   s12, s12
        s_movreld_b64   s14, s14

        s_cmp_eq_u32    m0, 0   //scc = (m0 == 0) ? 1 : 0
        s_cbranch_scc0  L_RESTORE_SGPR_LOOP
        // s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception.
        // Clear DEBUG_EN before and restore MODE after the barrier.
        s_setreg_imm32_b32      hwreg(HW_REG_MODE), 0
#if ASIC_FAMILY < CHIP_GFX12
        s_barrier       //barrier to ensure LDS is ready before access attempts from any other wave in the same TG
#endif
        /* restore HW registers */
L_RESTORE_HWREG:
        // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
        get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
        get_svgpr_size_bytes(s_restore_tmp)
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

        s_mov_b32       s_restore_buf_rsrc2, 0x1000000  //NUM_RECORDS in bytes
#if ASIC_FAMILY >= CHIP_GFX12
        // Restore s_restore_spi_init_hi before the saved value gets clobbered.
        s_mov_b32       s_restore_spi_init_hi, s_restore_spi_init_hi_save
#endif

        read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
        read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)

        S_WAITCNT_0

        s_setreg_b32    hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch

        read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
        S_WAITCNT_0

        s_setreg_b32    hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch
#if ASIC_FAMILY >= CHIP_GFX12
        read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
        S_WAITCNT_0
        s_setreg_b32    hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp

        read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
        S_WAITCNT_0
        s_setreg_b32    hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp

        // Only the first wave needs to restore the workgroup barrier.
        s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
        s_cbranch_scc0  L_SKIP_BARRIER_RESTORE

        // Skip over WAVE_STATUS, since there is no state to restore from it
        s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 4

        read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
        S_WAITCNT_0

        s_bitcmp1_b32   s_restore_tmp, BARRIER_STATE_VALID_OFFSET
        s_cbranch_scc0  L_SKIP_BARRIER_RESTORE

        // extract the saved signal count from s_restore_tmp
        s_lshr_b32      s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET

        // We need to call s_barrier_signal repeatedly to restore the signal
        // count of the work group barrier. The member count is already
        // initialized with the number of waves in the work group.
L_BARRIER_RESTORE_LOOP:
        s_and_b32       s_restore_tmp, s_restore_tmp, s_restore_tmp
        s_cbranch_scc0  L_SKIP_BARRIER_RESTORE
        s_barrier_signal        -1
        s_add_i32       s_restore_tmp, s_restore_tmp, -1
        s_branch        L_BARRIER_RESTORE_LOOP

L_SKIP_BARRIER_RESTORE:
#endif
        s_mov_b32       m0, s_restore_m0
        s_mov_b32       exec_lo, s_restore_exec_lo
        s_mov_b32       exec_hi, s_restore_exec_hi

#if HAVE_XNACK
        s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
#endif
        // {TRAPSTS/EXCP_FLAG_PRIV}.SAVE_CONTEXT and HOST_TRAP may have changed.
        // Only restore the other fields to avoid clobbering them.
        s_setreg_b32    hwreg(S_TRAPSTS_HWREG, 0, S_TRAPSTS_RESTORE_PART_1_SIZE), s_restore_trapsts
        s_lshr_b32      s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_2_SHIFT
        s_setreg_b32    hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_2_SHIFT, S_TRAPSTS_RESTORE_PART_2_SIZE), s_restore_trapsts

if S_TRAPSTS_RESTORE_PART_3_SIZE > 0
        s_lshr_b32      s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_3_SHIFT - S_TRAPSTS_RESTORE_PART_2_SHIFT
        s_setreg_b32    hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_3_SHIFT, S_TRAPSTS_RESTORE_PART_3_SIZE), s_restore_trapsts
end

        s_setreg_b32    hwreg(HW_REG_MODE), s_restore_mode
        // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
        // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
        get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
        get_svgpr_size_bytes(s_restore_ttmps_hi)
        s_add_u32       s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
        s_add_u32       s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
        s_add_u32       s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
        s_addc_u32      s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
        s_and_b32       s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
        s_load_dwordx4  [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE
        s_load_dwordx4  [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE
        s_load_dword    ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE
        S_WAITCNT_0

#if HAVE_XNACK
        restore_ib_sts(s_restore_tmp, s_restore_m0)
#endif
        s_and_b32       s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff    //pc[47:32] //Do it here in order not to affect STATUS
        s_and_b64       exec, exec, exec        // Restore STATUS.EXECZ, not writable by s_setreg_b32
        s_and_b64       vcc, vcc, vcc           // Restore STATUS.VCCZ, not writable by s_setreg_b32
#if ASIC_FAMILY >= CHIP_GFX12
        // If traps are enabled then return to the shader with PRIV=0.
        // Otherwise retain PRIV=1 for subsequent context save requests.
        s_getreg_b32    s_restore_tmp, hwreg(HW_REG_STATUS)
        s_bitcmp1_b32   s_restore_tmp, SQ_WAVE_STATUS_TRAP_EN_SHIFT
        s_cbranch_scc1  L_RETURN_WITHOUT_PRIV

        s_setreg_b32    hwreg(HW_REG_STATUS), s_restore_status  // SCC is included, changed by the preceding SALU
        s_setpc_b64     [s_restore_pc_lo, s_restore_pc_hi]
L_RETURN_WITHOUT_PRIV:
#endif

        s_setreg_b32    hwreg(S_STATUS_HWREG), s_restore_status // SCC is included, changed by the preceding SALU

#if ASIC_FAMILY >= CHIP_GFX12
        // Make barrier and LDS state visible to all waves in the group.
        // STATE_PRIV.BARRIER_COMPLETE may change after this point.
        s_barrier_signal        -2
        s_barrier_wait  -2
#endif

        s_rfe_b64       s_restore_pc_lo //Return to the main shader program and resume execution

L_END_PGM:
        s_endpgm_saved
end
function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
        // Copy into VGPR for later TCP store.
        v_writelane_b32 v2, s, m0
        s_add_u32       m0, m0, 0x1
#else
        s_mov_b32       exec_lo, m0
        s_mov_b32       m0, s_mem_offset
        s_buffer_store_dword    s, s_rsrc, m0 S_COHERENCE
        s_add_u32       s_mem_offset, s_mem_offset, 4
        s_mov_b32       m0, exec_lo
#endif
end
function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
        // Copy into VGPR for later TCP store.
        for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
                v_writelane_b32 v2, s[sgpr_idx], ttmp13
                s_add_u32       ttmp13, ttmp13, 0x1
        end
#else
        s_buffer_store_dwordx4  s[0], s_rsrc, 0 S_COHERENCE
        s_buffer_store_dwordx4  s[4], s_rsrc, 16 S_COHERENCE
        s_buffer_store_dwordx4  s[8], s_rsrc, 32 S_COHERENCE
        s_buffer_store_dwordx4  s[12], s_rsrc, 48 S_COHERENCE
        s_add_u32       s_rsrc[0], s_rsrc[0], 4*16
        s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0
#endif
end
function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
        // Copy into VGPR for later TCP store.
        for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
                v_writelane_b32 v2, s[sgpr_idx], ttmp13
                s_add_u32       ttmp13, ttmp13, 0x1
        end
#else
        s_buffer_store_dwordx4  s[0], s_rsrc, 0 S_COHERENCE
        s_buffer_store_dwordx4  s[4], s_rsrc, 16 S_COHERENCE
        s_buffer_store_dwordx4  s[8], s_rsrc, 32 S_COHERENCE
        s_add_u32       s_rsrc[0], s_rsrc[0], 4*12
        s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0
#endif
end
function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
        s_buffer_load_dword     s, s_rsrc, s_mem_offset S_COHERENCE
        s_add_u32       s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
        s_sub_u32       s_mem_offset, s_mem_offset, 4*16
        s_buffer_load_dwordx16  s, s_rsrc, s_mem_offset S_COHERENCE
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
        s_sub_u32       s_mem_offset, s_mem_offset, 4*8
        s_buffer_load_dwordx8   s, s_rsrc, s_mem_offset S_COHERENCE
end

function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
        s_sub_u32       s_mem_offset, s_mem_offset, 4*4
        s_buffer_load_dwordx4   s, s_rsrc, s_mem_offset S_COHERENCE
end
#if SAVE_AFTER_XNACK_ERROR
function check_if_tcp_store_ok
        // If TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
        s_getreg_b32    s_save_tmp, hwreg(HW_REG_TRAPSTS)
        s_andn2_b32     s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp
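        // s_andn2_b32 computes MASK & ~TRAPSTS and sets SCC when the result
        // is non-zero, i.e. SCC=1 means XNACK_ERROR is clear and TCP stores
        // are safe; callers take the TCP path via s_cbranch_scc1.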
L_TCP_STORE_CHECK_DONE:
end
function write_vgpr_to_mem_with_sqc(vgpr, n_lanes, s_rsrc, s_mem_offset)
        s_mov_b32       s4, 0

L_WRITE_VGPR_LANE_LOOP:
        for var lane = 0; lane < 4; ++lane
                v_readlane_b32  s[lane], vgpr, s4
                s_add_u32       s4, s4, 1
        end

        s_buffer_store_dwordx4  s[0:3], s_rsrc, s_mem_offset glc:1

        s_add_u32       s_mem_offset, s_mem_offset, 0x10
        s_cmp_eq_u32    s4, n_lanes
        s_cbranch_scc0  L_WRITE_VGPR_LANE_LOOP
end
function write_vgprs_to_mem_with_sqc_w32(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
        for var vgpr = 0; vgpr < n_vgprs; ++vgpr
                write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 32, s_rsrc, s_mem_offset)
        end
end

function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
        for var vgpr = 0; vgpr < n_vgprs; ++vgpr
                write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 64, s_rsrc, s_mem_offset)
        end
end
#endif
function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
        s_getreg_b32    s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
        s_add_u32       s_vgpr_size_byte, s_vgpr_size_byte, 1
        s_bitcmp1_b32   s_size, S_WAVE_SIZE
        s_cbranch_scc1  L_ENABLE_SHIFT_W64
        s_lshl_b32      s_vgpr_size_byte, s_vgpr_size_byte, (2+7)       //VGPR bytes = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value)
        s_branch        L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
        s_lshl_b32      s_vgpr_size_byte, s_vgpr_size_byte, (2+8)       //VGPR bytes = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value)
L_SHIFT_DONE:
end
function get_svgpr_size_bytes(s_svgpr_size_byte)
        s_getreg_b32    s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
        s_lshl_b32      s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
end
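// The (3+7) shift folds the shared-VGPR byte arithmetic into one step:
// shared_vgpr_size * 8 registers (<<3), each 32 lanes * 4 bytes (<<7);
// e.g. a field value of 2 yields 2 << 10 = 2048 bytes.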
function get_sgpr_size_bytes
        return 512
end

function get_hwreg_size_bytes
        return 128
end
function get_wave_size2(s_reg)
#if ASIC_FAMILY < CHIP_GFX12
        s_getreg_b32    s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
#else
        s_getreg_b32    s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE)
#endif
        s_lshl_b32      s_reg, s_reg, S_WAVE_SIZE
end
#if HAVE_XNACK
function save_and_clear_ib_sts(tmp1, tmp2)
        // Preserve and clear scalar XNACK state before issuing scalar loads.
        // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
        // unused space ttmp11[31:24].
        s_andn2_b32     ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
        s_getreg_b32    tmp1, hwreg(HW_REG_IB_STS)
        s_and_b32       tmp2, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
        s_lshl_b32      tmp2, tmp2, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
        s_or_b32        ttmp11, ttmp11, tmp2
        s_and_b32       tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
        s_lshl_b32      tmp2, tmp2, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
        s_or_b32        ttmp11, ttmp11, tmp2
        s_andn2_b32     tmp1, tmp1, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
        s_setreg_b32    hwreg(HW_REG_IB_STS), tmp1
end
function restore_ib_sts(tmp1, tmp2)
        s_lshr_b32      tmp1, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
        s_and_b32       tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
        s_lshr_b32      tmp1, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
        s_and_b32       tmp1, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
        s_or_b32        tmp1, tmp1, tmp2
        s_setreg_b32    hwreg(HW_REG_IB_STS), tmp1
end
#endif