/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * Navi1x:
 *   cpp -DASIC_FAMILY=CHIP_NAVI10 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3
 *   sp3 nv1x.sp3 -hex nv1x.hex
 *
 * gfx10:
 *   cpp -DASIC_FAMILY=CHIP_SIENNA_CICHLID cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3
 *   sp3 gfx10.sp3 -hex gfx10.hex
 *
 * gfx11:
 *   cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3
 *   sp3 gfx11.sp3 -hex gfx11.hex
 *
 * gfx12:
 *   cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx10.asm -P -o gfx12.sp3
 *   sp3 gfx12.sp3 -hex gfx12.hex
 */

#define CHIP_NAVI10 26
#define CHIP_SIENNA_CICHLID 30
#define CHIP_PLUM_BONITO 36
#define CHIP_GFX12 37

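// Feature gates derived from ASIC_FAMILY: Navi1x still supports XNACK replay
// and SQC stores; Sienna_Cichlid and later drop SQC stores; Plum_Bonito and
// later add s_sendmsg_rtn and lose buffer-to-LDS loads.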
#define NO_SQC_STORE (ASIC_FAMILY >= CHIP_SIENNA_CICHLID)
#define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID)
#define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO)
#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO && ASIC_FAMILY < CHIP_GFX12)
#define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
#define SINGLE_STEP_MISSED_WORKAROUND 1 // workaround for lost MODE.DEBUG_EN exception when SAVECTX raised

#if ASIC_FAMILY < CHIP_GFX12
#define S_COHERENCE glc:1
#define V_COHERENCE slc:1 glc:1
#define S_WAITCNT_0 s_waitcnt 0
#else
#define S_COHERENCE scope:SCOPE_SYS
#define V_COHERENCE scope:SCOPE_SYS
#define S_WAITCNT_0 s_wait_idle

#define HW_REG_SHADER_FLAT_SCRATCH_LO HW_REG_WAVE_SCRATCH_BASE_LO
#define HW_REG_SHADER_FLAT_SCRATCH_HI HW_REG_WAVE_SCRATCH_BASE_HI
#define HW_REG_GPR_ALLOC HW_REG_WAVE_GPR_ALLOC
#define HW_REG_LDS_ALLOC HW_REG_WAVE_LDS_ALLOC
#define HW_REG_MODE HW_REG_WAVE_MODE
#endif
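// gfx12 renames several wave hwregs; the aliases above let the rest of the
// handler keep using the pre-gfx12 names unconditionally.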

#if ASIC_FAMILY < CHIP_GFX12
var SQ_WAVE_STATUS_SPI_PRIO_MASK                = 0x00000006
var SQ_WAVE_STATUS_HALT_MASK                    = 0x2000
var SQ_WAVE_STATUS_ECC_ERR_MASK                 = 0x20000
var SQ_WAVE_STATUS_TRAP_EN_SHIFT                = 6
var SQ_WAVE_IB_STS2_WAVE64_SHIFT                = 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE                 = 1
var SQ_WAVE_LDS_ALLOC_GRANULARITY               = 8
var S_STATUS_HWREG                              = HW_REG_STATUS
var S_STATUS_ALWAYS_CLEAR_MASK                  = SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK
var S_STATUS_HALT_MASK                          = SQ_WAVE_STATUS_HALT_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK                   = 0x00FF0000
var S_SAVE_PC_HI_HT_MASK                        = 0x01000000
#else
var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK    = 0x4
var SQ_WAVE_STATE_PRIV_SCC_SHIFT                = 9
var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK            = 0xC00
var SQ_WAVE_STATE_PRIV_HALT_MASK                = 0x4000
var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK          = 0x8000
var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT         = 15
var SQ_WAVE_STATUS_WAVE64_SHIFT                 = 29
var SQ_WAVE_STATUS_WAVE64_SIZE                  = 1
var SQ_WAVE_LDS_ALLOC_GRANULARITY               = 9
var S_STATUS_HWREG                              = HW_REG_WAVE_STATE_PRIV
var S_STATUS_ALWAYS_CLEAR_MASK                  = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
var S_STATUS_HALT_MASK                          = SQ_WAVE_STATE_PRIV_HALT_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK                   = 0xF0000000
#endif

var SQ_WAVE_STATUS_NO_VGPRS_SHIFT               = 24
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT            = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE             = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE            = 8
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT    = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE     = 4

#if ASIC_FAMILY < CHIP_PLUM_BONITO
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT           = 8
#else
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT           = 12
#endif

#if ASIC_FAMILY < CHIP_GFX12
var SQ_WAVE_TRAPSTS_SAVECTX_MASK                = 0x400
var SQ_WAVE_TRAPSTS_EXCP_MASK                   = 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT               = 10
var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK             = 0x80
var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT            = 7
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK               = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT              = 8
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK           = 0x800
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT          = 11
var SQ_WAVE_TRAPSTS_EXCP_HI_MASK                = 0x7000
#if ASIC_FAMILY >= CHIP_PLUM_BONITO
var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT             = 16
var SQ_WAVE_TRAPSTS_WAVE_START_MASK             = 0x20000
var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT            = 17
var SQ_WAVE_TRAPSTS_WAVE_END_MASK               = 0x40000
var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK        = 0x100000
#endif
var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK            = 0x10000000

var SQ_WAVE_MODE_EXCP_EN_SHIFT                  = 12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT       = 19

var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT           = 15
var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT            = 25
var SQ_WAVE_IB_STS_REPLAY_W64H_MASK             = 0x02000000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK       = 0x003F8000

var SQ_WAVE_MODE_DEBUG_EN_MASK                  = 0x800

var S_TRAPSTS_RESTORE_PART_1_SIZE               = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT
var S_TRAPSTS_RESTORE_PART_2_SHIFT              = SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT

#if ASIC_FAMILY < CHIP_PLUM_BONITO
var S_TRAPSTS_NON_MASKABLE_EXCP_MASK            = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
var S_TRAPSTS_RESTORE_PART_2_SIZE               = 32 - S_TRAPSTS_RESTORE_PART_2_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SHIFT              = 0
var S_TRAPSTS_RESTORE_PART_3_SIZE               = 0
#else
var S_TRAPSTS_NON_MASKABLE_EXCP_MASK            = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK         |\
                                                  SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK     |\
                                                  SQ_WAVE_TRAPSTS_WAVE_START_MASK       |\
                                                  SQ_WAVE_TRAPSTS_WAVE_END_MASK         |\
                                                  SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK
var S_TRAPSTS_RESTORE_PART_2_SIZE               = SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT - SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SHIFT              = SQ_WAVE_TRAPSTS_WAVE_START_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SIZE               = 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT
#endif
var S_TRAPSTS_HWREG                             = HW_REG_TRAPSTS
var S_TRAPSTS_SAVE_CONTEXT_MASK                 = SQ_WAVE_TRAPSTS_SAVECTX_MASK
var S_TRAPSTS_SAVE_CONTEXT_SHIFT                = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT
#else
var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK      = 0xF
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK        = 0x10
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT   = 5
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK    = 0x20
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK    = 0x40
var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT   = 6
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK       = 0x80
var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT      = 7
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK      = 0x100
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT     = 8
var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK        = 0x200
var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800
var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK           = 0x80
var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK      = 0x200

var S_TRAPSTS_HWREG                             = HW_REG_WAVE_EXCP_FLAG_PRIV
var S_TRAPSTS_SAVE_CONTEXT_MASK                 = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK
var S_TRAPSTS_SAVE_CONTEXT_SHIFT                = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT
var S_TRAPSTS_NON_MASKABLE_EXCP_MASK            = SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK          |\
                                                  SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK      |\
                                                  SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK         |\
                                                  SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK        |\
                                                  SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK          |\
                                                  SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK
var S_TRAPSTS_RESTORE_PART_1_SIZE               = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT
var S_TRAPSTS_RESTORE_PART_2_SHIFT              = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var S_TRAPSTS_RESTORE_PART_2_SIZE               = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SHIFT              = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
var S_TRAPSTS_RESTORE_PART_3_SIZE               = 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT
var BARRIER_STATE_SIGNAL_OFFSET                 = 16
var BARRIER_STATE_VALID_OFFSET                  = 0
#endif

// bits [31:24] unused by SPI debug data
var TTMP11_SAVE_REPLAY_W64H_SHIFT               = 31
var TTMP11_SAVE_REPLAY_W64H_MASK                = 0x80000000
var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT         = 24
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK          = 0x7F000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT             = 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK              = 0x800000

// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE                = 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC                  = 0x10807FAC
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK             = 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT            = 26

var S_SAVE_PC_HI_FIRST_WAVE_MASK                = 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT               = 31

var s_sgpr_save_num                             = 108

var s_save_spi_init_lo                          = exec_lo
var s_save_spi_init_hi                          = exec_hi
var s_save_pc_lo                                = ttmp0
var s_save_pc_hi                                = ttmp1
var s_save_exec_lo                              = ttmp2
var s_save_exec_hi                              = ttmp3
var s_save_status                               = ttmp12
var s_save_trapsts                              = ttmp15
var s_save_xnack_mask                           = s_save_trapsts
var s_wave_size                                 = ttmp7
var s_save_buf_rsrc0                            = ttmp8
var s_save_buf_rsrc1                            = ttmp9
var s_save_buf_rsrc2                            = ttmp10
var s_save_buf_rsrc3                            = ttmp11
var s_save_mem_offset                           = ttmp4
var s_save_alloc_size                           = s_save_trapsts
var s_save_tmp                                  = ttmp14
var s_save_m0                                   = ttmp5
var s_save_ttmps_lo                             = s_save_tmp
var s_save_ttmps_hi                             = s_save_trapsts

var S_RESTORE_BUF_RSRC_WORD1_STRIDE             = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC               = S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK          = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT         = 26
var S_WAVE_SIZE                                 = 25

var s_restore_spi_init_lo                       = exec_lo
var s_restore_spi_init_hi                       = exec_hi
var s_restore_mem_offset                        = ttmp12
var s_restore_alloc_size                        = ttmp3
var s_restore_tmp                               = ttmp2
var s_restore_mem_offset_save                   = s_restore_tmp
var s_restore_m0                                = s_restore_alloc_size
var s_restore_mode                              = ttmp7
var s_restore_flat_scratch                      = s_restore_tmp
var s_restore_pc_lo                             = ttmp0
var s_restore_pc_hi                             = ttmp1
var s_restore_exec_lo                           = ttmp4
var s_restore_exec_hi                           = ttmp5
var s_restore_status                            = ttmp14
var s_restore_trapsts                           = ttmp15
var s_restore_xnack_mask                        = ttmp13
var s_restore_buf_rsrc0                         = ttmp8
var s_restore_buf_rsrc1                         = ttmp9
var s_restore_buf_rsrc2                         = ttmp10
var s_restore_buf_rsrc3                         = ttmp11
var s_restore_size                              = ttmp6
var s_restore_ttmps_lo                          = s_restore_tmp
var s_restore_ttmps_hi                          = s_restore_alloc_size
var s_restore_spi_init_hi_save                  = s_restore_exec_hi

shader main
        asic(DEFAULT)
        type(CS)
        wave_size(32)

        s_branch        L_SKIP_RESTORE                                          //Not a restore; may be a regular trap or save

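// Restore entry point: a context-restore launch starts one instruction past
// the handler base and lands on this jump; traps and saves enter at offset
// zero above.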
L_JUMP_TO_RESTORE:
        s_branch        L_RESTORE

L_SKIP_RESTORE:
        s_getreg_b32    s_save_status, hwreg(S_STATUS_HWREG)                    //save STATUS since we will change SCC

        // Clear SPI_PRIO: do not save with elevated priority.
        // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
        s_andn2_b32     s_save_status, s_save_status, S_STATUS_ALWAYS_CLEAR_MASK

        s_getreg_b32    s_save_trapsts, hwreg(S_TRAPSTS_HWREG)

#if SW_SA_TRAP
        // If ttmp1[30] is set then issue s_barrier to unblock dependent waves.
        s_bitcmp1_b32   s_save_pc_hi, 30
        s_cbranch_scc0  L_TRAP_NO_BARRIER
        s_barrier

L_TRAP_NO_BARRIER:
        // If ttmp1[31] is set then trap may occur early.
        // Spin wait until SAVECTX exception is raised.
        s_bitcmp1_b32   s_save_pc_hi, 31
        s_cbranch_scc1  L_CHECK_SAVE
#endif

        s_and_b32       ttmp2, s_save_status, S_STATUS_HALT_MASK
        s_cbranch_scc0  L_NOT_HALTED

L_HALTED:
        // Host trap may occur while wave is halted.
#if ASIC_FAMILY < CHIP_GFX12
        s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
#else
        s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
#endif
        s_cbranch_scc1  L_FETCH_2ND_TRAP

L_CHECK_SAVE:
        s_and_b32       ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK
        s_cbranch_scc1  L_SAVE

        // Wave is halted but neither host trap nor SAVECTX is raised.
        // Caused by instruction fetch memory violation.
        // Spin wait until context saved to prevent interrupt storm.
        s_sleep         0x10
        s_getreg_b32    s_save_trapsts, hwreg(S_TRAPSTS_HWREG)
        s_branch        L_CHECK_SAVE

L_NOT_HALTED:
        // Let second-level handle non-SAVECTX exception or trap.
        // Any concurrent SAVECTX will be handled upon re-entry once halted.

        // Check non-maskable exceptions. memory_violation, illegal_instruction
        // and xnack_error exceptions always cause the wave to enter the trap
        // handler.
        s_and_b32       ttmp2, s_save_trapsts, S_TRAPSTS_NON_MASKABLE_EXCP_MASK
        s_cbranch_scc1  L_FETCH_2ND_TRAP

        // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
        // Maskable exceptions only cause the wave to enter the trap handler if
        // their respective bit in mode.excp_en is set.
#if ASIC_FAMILY < CHIP_GFX12
        s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
        s_cbranch_scc0  L_CHECK_TRAP_ID

        s_and_b32       ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
        s_cbranch_scc0  L_NOT_ADDR_WATCH
        s_bitset1_b32   ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT // Check all addr_watch[123] exceptions against excp_en.addr_watch

L_NOT_ADDR_WATCH:
        s_getreg_b32    ttmp3, hwreg(HW_REG_MODE)
        s_lshl_b32      ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
        s_and_b32       ttmp2, ttmp2, ttmp3
        s_cbranch_scc1  L_FETCH_2ND_TRAP
#else
        s_getreg_b32    ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
        s_and_b32       ttmp3, s_save_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
        s_cbranch_scc0  L_NOT_ADDR_WATCH
        s_or_b32        ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK

L_NOT_ADDR_WATCH:
        s_getreg_b32    ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL)
        s_and_b32       ttmp2, ttmp3, ttmp2
        s_cbranch_scc1  L_FETCH_2ND_TRAP
#endif

L_CHECK_TRAP_ID:
        // Check trap_id != 0
        s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
        s_cbranch_scc1  L_FETCH_2ND_TRAP

#if SINGLE_STEP_MISSED_WORKAROUND
        // Prioritize single step exception over context save.
        // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
#if ASIC_FAMILY < CHIP_GFX12
        s_getreg_b32    ttmp2, hwreg(HW_REG_MODE)
        s_and_b32       ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
#else
        // WAVE_TRAP_CTRL is already in ttmp3.
        s_and_b32       ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK
#endif
        s_cbranch_scc1  L_FETCH_2ND_TRAP
#endif

        s_and_b32       ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK
        s_cbranch_scc1  L_SAVE

L_FETCH_2ND_TRAP:
#if HAVE_XNACK
        save_and_clear_ib_sts(ttmp14, ttmp15)
#endif

        // Read second-level TBA/TMA from first-level TMA and jump if available.
        // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
        // ttmp12 holds SQ_WAVE_STATUS
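        // First-level TMA layout, per the loads below (byte offsets):
        //   0x00: second-level TBA    0x08: second-level TMA
        //   0x10: debug-trap-enabled flag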
#if HAVE_SENDMSG_RTN
        s_sendmsg_rtn_b64       [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
        S_WAITCNT_0
#else
        s_getreg_b32    ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
        s_getreg_b32    ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
#endif
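        // TMA holds a 256-byte-aligned address stored as (address >> 8);
        // shift it back to a byte address and sign-extend the 48-bit result
        // to 64 bits.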
        s_lshl_b64      [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8

        s_bitcmp1_b32   ttmp15, 0xF
        s_cbranch_scc0  L_NO_SIGN_EXTEND_TMA
        s_or_b32        ttmp15, ttmp15, 0xFFFF0000
L_NO_SIGN_EXTEND_TMA:

        s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE               // debug trap enabled flag
        S_WAITCNT_0
        s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
        s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
        s_or_b32        ttmp11, ttmp11, ttmp2

        s_load_dwordx2  [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE       // second-level TBA
        S_WAITCNT_0
        s_load_dwordx2  [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE     // second-level TMA
        S_WAITCNT_0

        s_and_b64       [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
        s_cbranch_scc0  L_NO_NEXT_TRAP                                          // second-level trap handler has not been set
        s_setpc_b64     [ttmp2, ttmp3]                                          // jump to second-level trap handler

L_NO_NEXT_TRAP:
        // If not caused by trap then halt wave to prevent re-entry.
        s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
        s_cbranch_scc1  L_TRAP_CASE

        // Host trap will not cause trap re-entry.
#if ASIC_FAMILY < CHIP_GFX12
        s_and_b32       ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK
#else
        s_getreg_b32    ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
        s_and_b32       ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK
#endif
        s_cbranch_scc1  L_EXIT_TRAP
        s_or_b32        s_save_status, s_save_status, S_STATUS_HALT_MASK

        // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
        // Rewind the PC to prevent this from occurring.
        s_sub_u32       ttmp0, ttmp0, 0x8                                       // 64-bit PC decrement across
        s_subb_u32      ttmp1, ttmp1, 0x0                                       // {ttmp1, ttmp0} with borrow

        s_branch        L_EXIT_TRAP

L_TRAP_CASE:
        // Advance past trap instruction to prevent re-entry.
        s_add_u32       ttmp0, ttmp0, 0x4
        s_addc_u32      ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
        s_and_b32       ttmp1, ttmp1, 0xFFFF                                    // keep only PC[47:32]; clear trap id bits before RFE

#if HAVE_XNACK
        restore_ib_sts(ttmp14, ttmp15)
#endif

        // Restore SQ_WAVE_STATUS.
        s_and_b64       exec, exec, exec                                        // Restore STATUS.EXECZ, not writable by s_setreg_b32
        s_and_b64       vcc, vcc, vcc                                           // Restore STATUS.VCCZ, not writable by s_setreg_b32

#if ASIC_FAMILY < CHIP_GFX12
        s_setreg_b32    hwreg(S_STATUS_HWREG), s_save_status
#else
        // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
        // Only restore fields which the trap handler changes.
        s_lshr_b32      s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_SCC_SHIFT
        s_setreg_b32    hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
                SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status
#endif

        s_rfe_b64       [ttmp0, ttmp1]

L_SAVE:
        // If VGPRs have been deallocated then terminate the wavefront.
        // It has no remaining program to run and cannot save without VGPRs.
#if ASIC_FAMILY == CHIP_PLUM_BONITO
        s_bitcmp1_b32   s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
        s_cbranch_scc0  L_HAVE_VGPRS
        s_endpgm
L_HAVE_VGPRS:
#endif
#if ASIC_FAMILY >= CHIP_GFX12
        s_getreg_b32    s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
        s_bitcmp1_b32   s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
        s_cbranch_scc0  L_HAVE_VGPRS
        s_endpgm
L_HAVE_VGPRS:
#endif

        s_and_b32       s_save_pc_hi, s_save_pc_hi, 0x0000ffff                  //pc[47:32]
        s_mov_b32       s_save_tmp, 0
        s_setreg_b32    hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 1), s_save_tmp     //clear saveCtx bit

#if HAVE_XNACK
        save_and_clear_ib_sts(s_save_tmp, s_save_trapsts)
#endif
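        // Context save area layout, low to high (per the offset computations
        // below): VGPRs at offset 0, then shared VGPRs, SGPRs, HWREGs (with
        // ttmps stored at HWREG offset +0x40), and finally LDS.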

        /* inform SPI of readiness and wait for SPI's go signal */
        s_mov_b32       s_save_exec_lo, exec_lo                                 //save EXEC and use EXEC for the go signal from SPI
        s_mov_b32       s_save_exec_hi, exec_hi
        s_mov_b64       exec, 0x0                                               //clear EXEC to get ready to receive

#if HAVE_SENDMSG_RTN
        s_sendmsg_rtn_b64       [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
#else
        s_sendmsg       sendmsg(MSG_SAVEWAVE)                                   //send SPI a message and wait for SPI's write to EXEC
#endif

#if ASIC_FAMILY < CHIP_SIENNA_CICHLID
L_SLEEP:
        // Sleeping for 1 (64 clocks) is not enough with 8 waves per SIMD and
        // can hang the SQ: the 7th and 8th waves never win arbitration to
        // execute an instruction while the other waves sit in this sleep loop
        // waiting for wrexec != 0.
        s_sleep         0x2
        s_cbranch_execz L_SLEEP
#else
        S_WAITCNT_0
#endif

        // Save first_wave flag so we can clear high bits of save address.
        s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
        s_lshl_b32      s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
        s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp

#if NO_SQC_STORE
#if ASIC_FAMILY <= CHIP_SIENNA_CICHLID
        // gfx10: If there was a VALU exception, the exception state must be
        // cleared before executing the VALU instructions below.
        v_clrexcp
#endif

        // Trap temporaries must be saved via VGPR but all VGPRs are in use.
        // There is no ttmp space to hold the resource constant for VGPR save.
        // Save v0 by itself since it requires only two SGPRs.
        s_mov_b32       s_save_ttmps_lo, exec_lo
        s_and_b32       s_save_ttmps_hi, exec_hi, 0xFFFF
        s_mov_b32       exec_lo, 0xFFFFFFFF
        s_mov_b32       exec_hi, 0xFFFFFFFF
        global_store_dword_addtid       v0, [s_save_ttmps_lo, s_save_ttmps_hi] V_COHERENCE
        v_mov_b32       v0, 0x0
        s_mov_b32       exec_lo, s_save_ttmps_lo
        s_mov_b32       exec_hi, s_save_ttmps_hi
#endif

        // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
        // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
        get_wave_size2(s_save_ttmps_hi)
        get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
        get_svgpr_size_bytes(s_save_ttmps_hi)
        s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
        s_and_b32       s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
        s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
        s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
        s_addc_u32      s_save_ttmps_hi, s_save_ttmps_hi, 0x0

#if NO_SQC_STORE
        v_writelane_b32 v0, ttmp4, 0x4
        v_writelane_b32 v0, ttmp5, 0x5
        v_writelane_b32 v0, ttmp6, 0x6
        v_writelane_b32 v0, ttmp7, 0x7
        v_writelane_b32 v0, ttmp8, 0x8
        v_writelane_b32 v0, ttmp9, 0x9
        v_writelane_b32 v0, ttmp10, 0xA
        v_writelane_b32 v0, ttmp11, 0xB
        v_writelane_b32 v0, ttmp13, 0xD
        v_writelane_b32 v0, exec_lo, 0xE
        v_writelane_b32 v0, exec_hi, 0xF

        s_mov_b32       exec_lo, 0x3FFF
        s_mov_b32       exec_hi, 0x0
        global_store_dword_addtid       v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 V_COHERENCE
        v_readlane_b32  ttmp14, v0, 0xE
        v_readlane_b32  ttmp15, v0, 0xF
        s_mov_b32       exec_lo, ttmp14
        s_mov_b32       exec_hi, ttmp15
#else
        s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 S_COHERENCE
        s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 S_COHERENCE
        s_store_dword   ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 S_COHERENCE
#endif

        /* setup Resource Constants */
        s_mov_b32       s_save_buf_rsrc0, s_save_spi_init_lo                    //base_addr_lo
        s_and_b32       s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF        //base_addr_hi
        s_or_b32        s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
        s_mov_b32       s_save_buf_rsrc2, 0                                     //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
        s_mov_b32       s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC

        s_mov_b32       s_save_m0, m0

        /* global mem offset */
        s_mov_b32       s_save_mem_offset, 0x0
        get_wave_size2(s_wave_size)

#if HAVE_XNACK
        // Save and clear vector XNACK state late to free up SGPRs.
        s_getreg_b32    s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
        s_setreg_imm32_b32      hwreg(HW_REG_SHADER_XNACK_MASK), 0x0
#endif

        /* save first 4 VGPRs, needed for SGPR save */
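        // Bit S_WAVE_SIZE of s_wave_size indicates wave64; enable 32 or 64
        // EXEC lanes accordingly. The same selection pattern repeats for the
        // LDS and remaining-VGPR save paths below.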
        s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_SAVE_4VGPR_EXEC_HI
        s_mov_b32       exec_hi, 0x00000000
        s_branch        L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
        s_mov_b32       exec_hi, 0xFFFFFFFF
        s_branch        L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes

        // VGPR Allocated in 4-GPR granularity

#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1 L_SAVE_FIRST_VGPRS32_WITH_TCP

        write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
        s_branch L_SAVE_HWREG

L_SAVE_FIRST_VGPRS32_WITH_TCP:
#endif

#if !NO_SQC_STORE
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#endif
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3
        s_branch        L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes

        // VGPR Allocated in 4-GPR granularity

#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1 L_SAVE_FIRST_VGPRS64_WITH_TCP

        write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
        s_branch L_SAVE_HWREG

L_SAVE_FIRST_VGPRS64_WITH_TCP:
#endif

#if !NO_SQC_STORE
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#endif
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3

        /* save HW registers */

L_SAVE_HWREG:
        // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
        get_svgpr_size_bytes(s_save_tmp)
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()

        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes

#if NO_SQC_STORE
        v_mov_b32       v0, 0x0                                                 //Offset[31:0] from buffer resource
        v_mov_b32       v1, 0x0                                                 //Offset[63:32] from buffer resource
        v_mov_b32       v2, 0x0                                                 //Set of SGPRs for TCP store
        s_mov_b32       m0, 0x0                                                 //Next lane of v2 to write to
#endif

#if ASIC_FAMILY >= CHIP_GFX12
        // Ensure no further changes to barrier or LDS state.
        // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
        s_barrier_signal        -2
        s_barrier_wait  -2

        // Re-read final state of BARRIER_COMPLETE field for save.
        s_getreg_b32    s_save_tmp, hwreg(S_STATUS_HWREG)
        s_and_b32       s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
        s_andn2_b32     s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
        s_or_b32        s_save_status, s_save_status, s_save_tmp
#endif

        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
        write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
        s_andn2_b32     s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
        write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
        write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
        write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
        write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_tmp, hwreg(S_TRAPSTS_HWREG)
        write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)

        // Not used on Sienna_Cichlid but keep layout same for debugger.
        write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

#if ASIC_FAMILY >= CHIP_GFX12
        s_getreg_b32    s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL)
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

        s_getreg_b32    s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
        write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)

        s_get_barrier_state s_save_tmp, -1
        s_wait_kmcnt (0)
        write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
#endif

#if NO_SQC_STORE
        // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
        s_mov_b32       exec_lo, 0xFFFF
        s_mov_b32       exec_hi, 0x0
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

        // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
        s_mov_b32       exec_lo, 0xFFFFFFFF
#endif

        /* save SGPRs */
        // Save SGPRs before the LDS save so that s0-s4 can be used during the LDS save...

        // SGPR SR memory offset : size(VGPR)+size(SVGPR)
        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
        get_svgpr_size_bytes(s_save_tmp)
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes

#if NO_SQC_STORE
        s_mov_b32       ttmp13, 0x0                                             //next VGPR lane to copy SGPR into
#else
        // back up s_save_buf_rsrc0, since the write_16sgpr_to_mem function will change rsrc0
        s_mov_b32       s_save_xnack_mask, s_save_buf_rsrc0
        s_add_u32       s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
        s_addc_u32      s_save_buf_rsrc1, s_save_buf_rsrc1, 0
#endif

        s_mov_b32       m0, 0x0                                                 //SGPR initial index value =0
        s_nop           0x0                                                     //Manually inserted wait states
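        // s_sgpr_save_num = 108 SGPRs are saved in total: the loop below
        // covers the first 96 in groups of 16, then the remaining 12 are
        // saved after the loop.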
L_SAVE_SGPR_LOOP:
        // SGPR is allocated in 16 SGPR granularity
        s_movrels_b64   s0, s0                                                  //s0 = s[0+m0], s1 = s[1+m0]
        s_movrels_b64   s2, s2                                                  //s2 = s[2+m0], s3 = s[3+m0]
        s_movrels_b64   s4, s4                                                  //s4 = s[4+m0], s5 = s[5+m0]
        s_movrels_b64   s6, s6                                                  //s6 = s[6+m0], s7 = s[7+m0]
        s_movrels_b64   s8, s8                                                  //s8 = s[8+m0], s9 = s[9+m0]
        s_movrels_b64   s10, s10                                                //s10 = s[10+m0], s11 = s[11+m0]
        s_movrels_b64   s12, s12                                                //s12 = s[12+m0], s13 = s[13+m0]
        s_movrels_b64   s14, s14                                                //s14 = s[14+m0], s15 = s[15+m0]

        write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
        s_cmp_eq_u32    ttmp13, 0x20                                            //have 32 VGPR lanes filled?
        s_cbranch_scc0  L_SAVE_SGPR_SKIP_TCP_STORE

        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
        s_add_u32       s_save_mem_offset, s_save_mem_offset, 0x80
        s_mov_b32       ttmp13, 0x0
        v_mov_b32       v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:
#endif

        s_add_u32       m0, m0, 16                                              //next sgpr index
        s_cmp_lt_u32    m0, 96                                                  //scc = (m0 < first 96 SGPRs) ? 1 : 0
        s_cbranch_scc1  L_SAVE_SGPR_LOOP                                        //first 96 SGPR save is complete?

        //save the remaining 12 SGPRs
        s_movrels_b64   s0, s0                                                  //s0 = s[0+m0], s1 = s[1+m0]
        s_movrels_b64   s2, s2                                                  //s2 = s[2+m0], s3 = s[3+m0]
        s_movrels_b64   s4, s4                                                  //s4 = s[4+m0], s5 = s[5+m0]
        s_movrels_b64   s6, s6                                                  //s6 = s[6+m0], s7 = s[7+m0]
        s_movrels_b64   s8, s8                                                  //s8 = s[8+m0], s9 = s[9+m0]
        s_movrels_b64   s10, s10                                                //s10 = s[10+m0], s11 = s[11+m0]
        write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
#else
        // restore s_save_buf_rsrc0,1
        s_mov_b32       s_save_buf_rsrc0, s_save_xnack_mask
#endif

        /* save LDS */

L_SAVE_LDS:
        // Change EXEC to all threads...
        s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_SAVE_LDS_EXEC_HI
        s_mov_b32       exec_hi, 0x00000000
        s_branch        L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
        s_mov_b32       exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
        s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
        s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF        //lds_size is zero?
        s_cbranch_scc0  L_SAVE_LDS_DONE                                         //no lds used? jump to L_SAVE_LDS_DONE

#if ASIC_FAMILY < CHIP_GFX12
        s_barrier                                                               //LDS is used? wait for other waves in the same TG
#endif
        s_and_b32       s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
        s_cbranch_scc0  L_SAVE_LDS_DONE

        // first wave does LDS save;

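        // The LDS_SIZE field counts allocation granules: with granularity
        // shift 8 (pre-gfx12) each granule is 256 bytes, with shift 9 (gfx12)
        // it is 512 bytes.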
        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
        s_mov_b32       s_save_buf_rsrc2, s_save_alloc_size                     //NUM_RECORDS in bytes

        // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
        //
        get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
        get_svgpr_size_bytes(s_save_tmp)
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s_save_tmp
        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
        s_add_u32       s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

        s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes

        //load 0~63*4(byte address) to vgpr v0
        v_mbcnt_lo_u32_b32      v0, -1, 0
        v_mbcnt_hi_u32_b32      v0, -1, v0
        v_mul_u32_u24   v0, 4, v0

        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_mov_b32       m0, 0x0
        s_cbranch_scc1  L_SAVE_LDS_W64

L_SAVE_LDS_W32:
#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W32

L_SAVE_LDS_LOOP_SQC_W32:
        ds_read_b32     v1, v0
        S_WAITCNT_0

        write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)

        s_add_u32       m0, m0, 128                                             //every iteration saves 128 bytes
        v_add_nc_u32    v0, v0, 128                                             //mem offset increased by 128 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc=(m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_LDS_LOOP_SQC_W32                                 //LDS save is complete?

        s_branch        L_SAVE_LDS_DONE

L_SAVE_LDS_WITH_TCP_W32:
#endif

        s_mov_b32       s3, 128
        s_nop           0
        s_nop           0
        s_nop           0
L_SAVE_LDS_LOOP_W32:
        ds_read_b32     v1, v0
        S_WAITCNT_0
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

        s_add_u32       m0, m0, s3                                              //every buffer_store_dword does 128 bytes
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s3
        v_add_nc_u32    v0, v0, 128                                             //mem offset increased by 128 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc=(m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_LDS_LOOP_W32                                     //LDS save is complete?

        s_branch        L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
#if SAVE_AFTER_XNACK_ERROR
        check_if_tcp_store_ok()
        s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W64

L_SAVE_LDS_LOOP_SQC_W64:
        ds_read_b32     v1, v0
        S_WAITCNT_0

        write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)

        s_add_u32       m0, m0, 256                                             //every iteration saves 256 bytes
        v_add_nc_u32    v0, v0, 256                                             //mem offset increased by 256 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc=(m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_LDS_LOOP_SQC_W64                                 //LDS save is complete?

        s_branch        L_SAVE_LDS_DONE

L_SAVE_LDS_WITH_TCP_W64:
#endif

        s_mov_b32       s3, 256
        s_nop           0
        s_nop           0
        s_nop           0
L_SAVE_LDS_LOOP_W64:
        ds_read_b32     v1, v0
        S_WAITCNT_0
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE

        s_add_u32       m0, m0, s3                                              //every buffer_store_dword does 256 bytes
        s_add_u32       s_save_mem_offset, s_save_mem_offset, s3
        v_add_nc_u32    v0, v0, 256                                             //mem offset increased by 256 bytes
        s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc=(m0 < s_save_alloc_size) ? 1 : 0
        s_cbranch_scc1  L_SAVE_LDS_LOOP_W64                                     //LDS save is complete?

L_SAVE_LDS_DONE:
        /* save VGPRs - save the remaining VGPRs */
L_SAVE_VGPR:
        // VGPR SR memory offset: 0
        s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
        s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
        s_and_b32       m0, m0, 1
        s_cmp_eq_u32    m0, 1
        s_cbranch_scc1  L_ENABLE_SAVE_VGPR_EXEC_HI
        s_mov_b32       s_save_mem_offset, (0+128*4)                            // for the remaining VGPRs
        s_mov_b32       exec_hi, 0x00000000
        s_branch        L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
        s_mov_b32       s_save_mem_offset, (0+256*4)                            // for the remaining VGPRs
        s_mov_b32       exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
        s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
        s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
        s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                 //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
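        //e.g. an encoded vgpr_size of 15 yields (15+1)*4 = 64 VGPRs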
942         //determine it is wave32 or wave64
943         s_lshr_b32      m0, s_wave_size, S_WAVE_SIZE
944         s_and_b32       m0, m0, 1
945         s_cmp_eq_u32    m0, 1
946         s_cbranch_scc1  L_SAVE_VGPR_WAVE64
947
948         s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes
949
950         // VGPR Allocated in 4-GPR granularity
951
952         // VGPR store using dw burst
953         s_mov_b32       m0, 0x4                                                 //VGPR initial index value =4
954         s_cmp_lt_u32    m0, s_save_alloc_size
955         s_cbranch_scc0  L_SAVE_VGPR_END
956
957 #if  SAVE_AFTER_XNACK_ERROR
958         check_if_tcp_store_ok()
959         s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP
960
961 L_SAVE_VGPR_LOOP_SQC_W32:
962         v_movrels_b32   v0, v0                                                  //v0 = v[0+m0]
963         v_movrels_b32   v1, v1                                                  //v1 = v[1+m0]
964         v_movrels_b32   v2, v2                                                  //v2 = v[2+m0]
965         v_movrels_b32   v3, v3                                                  //v3 = v[3+m0]
966
967         write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
968
969         s_add_u32 m0, m0, 4
970         s_cmp_lt_u32 m0, s_save_alloc_size
971         s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W32
972
973         s_branch L_SAVE_VGPR_END
974 #endif
975
976 L_SAVE_VGPR_W32_LOOP:
977         v_movrels_b32   v0, v0                                                  //v0 = v[0+m0]
978         v_movrels_b32   v1, v1                                                  //v1 = v[1+m0]
979         v_movrels_b32   v2, v2                                                  //v2 = v[2+m0]
980         v_movrels_b32   v3, v3                                                  //v3 = v[3+m0]
981
982         buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
983         buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128
984         buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2
985         buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3
986
987         s_add_u32       m0, m0, 4                                               //next vgpr index
988         s_add_u32       s_save_mem_offset, s_save_mem_offset, 128*4             //every buffer_store_dword does 128 bytes
989         s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
990         s_cbranch_scc1  L_SAVE_VGPR_W32_LOOP                                    //VGPR save is complete?
991
992         s_branch        L_SAVE_VGPR_END
993
994 L_SAVE_VGPR_WAVE64:
995         s_mov_b32       s_save_buf_rsrc2, 0x1000000                             //NUM_RECORDS in bytes
996
997         // VGPR store using dw burst
998         s_mov_b32       m0, 0x4                                                 //VGPR initial index value =4
999         s_cmp_lt_u32    m0, s_save_alloc_size
1000         s_cbranch_scc0  L_SAVE_SHARED_VGPR
1001
1002 #if  SAVE_AFTER_XNACK_ERROR
1003         check_if_tcp_store_ok()
1004         s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP
1005
1006 L_SAVE_VGPR_LOOP_SQC_W64:
1007         v_movrels_b32   v0, v0                                                  //v0 = v[0+m0]
1008         v_movrels_b32   v1, v1                                                  //v1 = v[1+m0]
1009         v_movrels_b32   v2, v2                                                  //v2 = v[2+m0]
1010         v_movrels_b32   v3, v3                                                  //v3 = v[3+m0]
1011
1012         write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
1013
1014         s_add_u32 m0, m0, 4
1015         s_cmp_lt_u32 m0, s_save_alloc_size
1016         s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W64
1017
1018         s_branch L_SAVE_VGPR_END
1019 #endif
1020
1021 L_SAVE_VGPR_W64_LOOP:
1022         v_movrels_b32   v0, v0                                                  //v0 = v[0+m0]
1023         v_movrels_b32   v1, v1                                                  //v1 = v[1+m0]
1024         v_movrels_b32   v2, v2                                                  //v2 = v[2+m0]
1025         v_movrels_b32   v3, v3                                                  //v3 = v[3+m0]
1026
1027         buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
1028         buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256
1029         buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2
1030         buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3
1031
1032         s_add_u32       m0, m0, 4                                               //next vgpr index
1033         s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4             //every buffer_store_dword does 256 bytes
1034         s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
1035         s_cbranch_scc1  L_SAVE_VGPR_W64_LOOP                                    //VGPR save is complete?
1036
1037 L_SAVE_SHARED_VGPR:
1038         //Below part will be the save shared vgpr part (new for gfx10)
1039         s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
1040         s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF        //shared_vgpr_size is zero?
1041         s_cbranch_scc0  L_SAVE_VGPR_END                                         //no shared_vgpr used? jump to L_SAVE_LDS
1042         s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 3                 //Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
1043         //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
1044         //save shared_vgpr will start from the index of m0
1045         s_add_u32       s_save_alloc_size, s_save_alloc_size, m0
1046         s_mov_b32       exec_lo, 0xFFFFFFFF
1047         s_mov_b32       exec_hi, 0x00000000
1048
1049 #if  SAVE_AFTER_XNACK_ERROR
1050         check_if_tcp_store_ok()
1051         s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP
1052
1053 L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC:
1054         v_movrels_b32   v0, v0
1055
1056         write_vgprs_to_mem_with_sqc_w64(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)
1057
1058         s_add_u32 m0, m0, 1
1059         s_cmp_lt_u32 m0, s_save_alloc_size
1060         s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC
1061
1062         s_branch L_SAVE_VGPR_END
1063 #endif
1064
1065 L_SAVE_SHARED_VGPR_WAVE64_LOOP:
1066         v_movrels_b32   v0, v0                                                  //v0 = v[0+m0]
1067         buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
1068         s_add_u32       m0, m0, 1                                               //next vgpr index
1069         s_add_u32       s_save_mem_offset, s_save_mem_offset, 128
1070         s_cmp_lt_u32    m0, s_save_alloc_size                                   //scc = (m0 < s_save_alloc_size) ? 1 : 0
1071         s_cbranch_scc1  L_SAVE_SHARED_VGPR_WAVE64_LOOP                          //SHARED_VGPR save is complete?
1072
1073 L_SAVE_VGPR_END:
1074         s_branch        L_END_PGM
1075
1076 L_RESTORE:
1077         /* Setup Resource Contants */
1078         s_mov_b32       s_restore_buf_rsrc0, s_restore_spi_init_lo              //base_addr_lo
1079         s_and_b32       s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF  //base_addr_hi
1080         s_or_b32        s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
1081         s_mov_b32       s_restore_buf_rsrc2, 0                                  //NUM_RECORDS initial value = 0 (in bytes)
1082         s_mov_b32       s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
1083
1084 #if ASIC_FAMILY >= CHIP_GFX12
1085         // Save s_restore_spi_init_hi for later use.
1086         s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi
1087 #endif
1088
1089         //determine it is wave32 or wave64
1090         get_wave_size2(s_restore_size)
1091
1092         s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
1093         s_cbranch_scc0  L_RESTORE_VGPR
1094
1095         /* restore LDS */
1096 L_RESTORE_LDS:
1097         s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
1098         s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
1099         s_and_b32       m0, m0, 1
1100         s_cmp_eq_u32    m0, 1
1101         s_cbranch_scc1  L_ENABLE_RESTORE_LDS_EXEC_HI
1102         s_mov_b32       exec_hi, 0x00000000
1103         s_branch        L_RESTORE_LDS_NORMAL
1104 L_ENABLE_RESTORE_LDS_EXEC_HI:
1105         s_mov_b32       exec_hi, 0xFFFFFFFF
1106 L_RESTORE_LDS_NORMAL:
1107         s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
1108         s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF  //lds_size is zero?
1109         s_cbranch_scc0  L_RESTORE_VGPR                                          //no lds used? jump to L_RESTORE_VGPR
1110         s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
1111         s_mov_b32       s_restore_buf_rsrc2, s_restore_alloc_size               //NUM_RECORDS in bytes
1112
1113         // LDS at offset: size(VGPR)+size(SVGPR)+size(SGPR)+size(HWREG)
1114         //
1115         get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
1116         get_svgpr_size_bytes(s_restore_tmp)
1117         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
1118         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
1119         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
1120
1121         s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes
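        //assumption: NUM_RECORDS is widened to 16MB here so the accesses below are not clamped to the LDS size; the loop bound itself limits the range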
1122
1123         s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
1124         s_and_b32       m0, m0, 1
1125         s_cmp_eq_u32    m0, 1
1126         s_mov_b32       m0, 0x0
1127         s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64
1128
1129 L_RESTORE_LDS_LOOP_W32:
1130 #if HAVE_BUFFER_LDS_LOAD
1131         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
1132 #else
1133         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
1134         S_WAITCNT_0
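        //ds_store_addtid_b32 writes v0 to LDS at address m0 + 4*tid for each active lane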
1135         ds_store_addtid_b32     v0
1136 #endif
1137         s_add_u32       m0, m0, 128                                             // 128 bytes (32 lanes * 1 DW)
1138         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128         //mem offset increased by 128 bytes
1139         s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc=(m0 < s_restore_alloc_size) ? 1 : 0
1140         s_cbranch_scc1  L_RESTORE_LDS_LOOP_W32                                  //LDS restore is complete?
1141         s_branch        L_RESTORE_VGPR
1142
1143 L_RESTORE_LDS_LOOP_W64:
1144 #if HAVE_BUFFER_LDS_LOAD
1145         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
1146 #else
1147         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
1148         S_WAITCNT_0
1149         ds_store_addtid_b32     v0
1150 #endif
1151         s_add_u32       m0, m0, 256                                             // 256 bytes (64 lanes * 1 DW)
1152         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256         //mem offset increased by 256 bytes
1153         s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc=(m0 < s_restore_alloc_size) ? 1 : 0
1154         s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64                                  //LDS restore is complete?
1155
1156         /* restore VGPRs */
1157 L_RESTORE_VGPR:
1158         // VGPR SR memory offset : 0
1159         s_mov_b32       s_restore_mem_offset, 0x0
1160         s_mov_b32       exec_lo, 0xFFFFFFFF                                     //need every thread from now on
1161         s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
1162         s_and_b32       m0, m0, 1
1163         s_cmp_eq_u32    m0, 1
1164         s_cbranch_scc1  L_ENABLE_RESTORE_VGPR_EXEC_HI
1165         s_mov_b32       exec_hi, 0x00000000
1166         s_branch        L_RESTORE_VGPR_NORMAL
1167 L_ENABLE_RESTORE_VGPR_EXEC_HI:
1168         s_mov_b32       exec_hi, 0xFFFFFFFF
1169 L_RESTORE_VGPR_NORMAL:
1170         s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
1171         s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
1172         s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2           //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
1173         //determine whether the wave is wave32 or wave64
1174         s_lshr_b32      m0, s_restore_size, S_WAVE_SIZE
1175         s_and_b32       m0, m0, 1
1176         s_cmp_eq_u32    m0, 1
1177         s_cbranch_scc1  L_RESTORE_VGPR_WAVE64
1178
1179         s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes
1180
1181         // VGPR load using dw burst
1182         s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset         // restore starts with v4, v0 will be the last
1183         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128*4
1184         s_mov_b32       m0, 4                                                   //VGPR initial index value = 4
1185         s_cmp_lt_u32    m0, s_restore_alloc_size
1186         s_cbranch_scc0  L_RESTORE_SGPR
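        //each pass loads four dwords per lane into v0-v3 and scatters them to v[m0..m0+3] with v_movreld; the real v0-v3 are reloaded last from s_restore_mem_offset_save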
1187
1188 L_RESTORE_VGPR_WAVE32_LOOP:
1189         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
1190         buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128
1191         buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2
1192         buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3
1193         S_WAITCNT_0
1194         v_movreld_b32   v0, v0                                                  //v[0+m0] = v0
1195         v_movreld_b32   v1, v1
1196         v_movreld_b32   v2, v2
1197         v_movreld_b32   v3, v3
1198         s_add_u32       m0, m0, 4                                               //next vgpr index
1199         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128*4       //every buffer_load_dword does 128 bytes
1200         s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
1201         s_cbranch_scc1  L_RESTORE_VGPR_WAVE32_LOOP                              //VGPR restore (except v0-v3) is complete?
1202
1203         /* VGPR restore on v0-v3 */
1204         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE
1205         buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128
1206         buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2
1207         buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3
1208         S_WAITCNT_0
1209
1210         s_branch        L_RESTORE_SGPR
1211
1212 L_RESTORE_VGPR_WAVE64:
1213         s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes
1214
1215         // VGPR load using dw burst
1216         s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset         // restore start with v4, v0 will be the last
1217         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
1218         s_mov_b32       m0, 4                                                   //VGPR initial index value = 4
1219         s_cmp_lt_u32    m0, s_restore_alloc_size
1220         s_cbranch_scc0  L_RESTORE_SHARED_VGPR
1221
1222 L_RESTORE_VGPR_WAVE64_LOOP:
1223         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
1224         buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256
1225         buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2
1226         buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3
1227         S_WAITCNT_0
1228         v_movreld_b32   v0, v0                                                  //v[0+m0] = v0
1229         v_movreld_b32   v1, v1
1230         v_movreld_b32   v2, v2
1231         v_movreld_b32   v3, v3
1232         s_add_u32       m0, m0, 4                                               //next vgpr index
1233         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4       //every buffer_load_dword does 256 bytes
1234         s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
1235         s_cbranch_scc1  L_RESTORE_VGPR_WAVE64_LOOP                              //VGPR restore (except v0-v3) is complete?
1236
1237 L_RESTORE_SHARED_VGPR:
1238         //Restore the shared VGPRs (new for gfx10)
1239         s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)  //shared_vgpr_size
1240         s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF  //shared_vgpr_size is zero?
1241         s_cbranch_scc0  L_RESTORE_V0                                            //no shared_vgpr used?
1242         s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 3           //Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
1243         //m0 now holds the normal VGPR count; add the shared VGPR count to it to get the total count.
1244         //the shared VGPR restore starts from index m0
1245         s_add_u32       s_restore_alloc_size, s_restore_alloc_size, m0
1246         s_mov_b32       exec_lo, 0xFFFFFFFF
1247         s_mov_b32       exec_hi, 0x00000000
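        //as on the save side, shared VGPRs are restored with only the low 32 lanes enabled (128-byte stride)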
1248 L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
1249         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
1250         S_WAITCNT_0
1251         v_movreld_b32   v0, v0                                                  //v[0+m0] = v0
1252         s_add_u32       m0, m0, 1                                               //next vgpr index
1253         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 128
1254         s_cmp_lt_u32    m0, s_restore_alloc_size                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
1255         s_cbranch_scc1  L_RESTORE_SHARED_VGPR_WAVE64_LOOP                       //SHARED_VGPR restore is complete?
1256
1257         s_mov_b32       exec_hi, 0xFFFFFFFF                                     //restore back exec_hi before restoring V0!!
1258
1259         /* VGPR restore on v0-v3 */
1260 L_RESTORE_V0:
1261         buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE
1262         buffer_load_dword       v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256
1263         buffer_load_dword       v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2
1264         buffer_load_dword       v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3
1265         S_WAITCNT_0
1266
1267         /* restore SGPRs */
1268         //will be 4+8+16*6 = 108 SGPRs
1269         // SGPR SR memory offset : size(VGPR)+size(SVGPR)
1270 L_RESTORE_SGPR:
1271         get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
1272         get_svgpr_size_bytes(s_restore_tmp)
1273         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
1274         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
1275         s_sub_u32       s_restore_mem_offset, s_restore_mem_offset, 20*4        //s108~s127 are not saved
1276
1277         s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes
1278
1279         s_mov_b32       m0, s_sgpr_save_num
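        //m0 counts down to zero; the read helpers decrement the memory offset before each load, walking backward through the SGPR save area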
1280
1281         read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
1282         S_WAITCNT_0
1283
1284         s_sub_u32       m0, m0, 4                                               // Restore from S[0] to S[104]
1285         s_nop           0                                                       // hazard SALU M0=> S_MOVREL
1286
1287         s_movreld_b64   s0, s0                                                  //s[0+m0] = s0
1288         s_movreld_b64   s2, s2
1289
1290         read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
1291         S_WAITCNT_0
1292
1293         s_sub_u32       m0, m0, 8                                               // Restore from S[0] to S[96]
1294         s_nop           0                                                       // hazard SALU M0=> S_MOVREL
1295
1296         s_movreld_b64   s0, s0                                                  //s[0+m0] = s0
1297         s_movreld_b64   s2, s2
1298         s_movreld_b64   s4, s4
1299         s_movreld_b64   s6, s6
1300
1301 L_RESTORE_SGPR_LOOP:
1302         read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
1303         S_WAITCNT_0
1304
1305         s_sub_u32       m0, m0, 16                                              // Restore from S[n] to S[0]
1306         s_nop           0                                                       // hazard SALU M0=> S_MOVREL
1307
1308         s_movreld_b64   s0, s0                                                  //s[0+m0] = s0
1309         s_movreld_b64   s2, s2
1310         s_movreld_b64   s4, s4
1311         s_movreld_b64   s6, s6
1312         s_movreld_b64   s8, s8
1313         s_movreld_b64   s10, s10
1314         s_movreld_b64   s12, s12
1315         s_movreld_b64   s14, s14
1316
1317         s_cmp_eq_u32    m0, 0                                                   //scc = (m0 == 0) ? 1 : 0
1318         s_cbranch_scc0  L_RESTORE_SGPR_LOOP
1319
1320         // s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception.
1321         // Clear DEBUG_EN before and restore MODE after the barrier.
1322         s_setreg_imm32_b32      hwreg(HW_REG_MODE), 0
1323 #if ASIC_FAMILY < CHIP_GFX12
1324         s_barrier                                                               //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG
1325 #endif
1326
1327         /* restore HW registers */
1328 L_RESTORE_HWREG:
1329         // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
1330         get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
1331         get_svgpr_size_bytes(s_restore_tmp)
1332         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
1333         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
1334
1335         s_mov_b32       s_restore_buf_rsrc2, 0x1000000                          //NUM_RECORDS in bytes
1336
1337 #if ASIC_FAMILY >= CHIP_GFX12
1338         // Restore s_restore_spi_init_hi before the saved value gets clobbered.
1339         s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save
1340 #endif
1341
1342         read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
1343         read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
1344         read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
1345         read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
1346         read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
1347         read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
1348         read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
1349         read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
1350         read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
1351         read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
1352         S_WAITCNT_0
1353
1354         s_setreg_b32    hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch
1355
1356         read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
1357         S_WAITCNT_0
1358
1359         s_setreg_b32    hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch
1360
1361 #if ASIC_FAMILY >= CHIP_GFX12
1362         read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
1363         S_WAITCNT_0
1364         s_setreg_b32    hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp
1365
1366         read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
1367         S_WAITCNT_0
1368         s_setreg_b32    hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp
1369
1370         // Only the first wave needs to restore the workgroup barrier.
1371         s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
1372         s_cbranch_scc0  L_SKIP_BARRIER_RESTORE
1373
1374         // Skip over WAVE_STATUS, since there is no state to restore from it
1375         s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 4
1376
1377         read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
1378         S_WAITCNT_0
1379
1380         s_bitcmp1_b32   s_restore_tmp, BARRIER_STATE_VALID_OFFSET
1381         s_cbranch_scc0  L_SKIP_BARRIER_RESTORE
1382
1383         // extract the saved signal count from s_restore_tmp
1384         s_lshr_b32      s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET
1385
1386         // We need to call s_barrier_signal repeatedly to restore the signal
1387         // count of the work group barrier.  The member count is already
1388         // initialized with the number of waves in the work group.
1389 L_BARRIER_RESTORE_LOOP:
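        //s_and of the count with itself only sets SCC = (count != 0)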
1390         s_and_b32       s_restore_tmp, s_restore_tmp, s_restore_tmp
1391         s_cbranch_scc0  L_SKIP_BARRIER_RESTORE
1392         s_barrier_signal        -1
1393         s_add_i32       s_restore_tmp, s_restore_tmp, -1
1394         s_branch        L_BARRIER_RESTORE_LOOP
1395
1396 L_SKIP_BARRIER_RESTORE:
1397 #endif
1398
1399         s_mov_b32       m0, s_restore_m0
1400         s_mov_b32       exec_lo, s_restore_exec_lo
1401         s_mov_b32       exec_hi, s_restore_exec_hi
1402
1403 #if HAVE_XNACK
1404         s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
1405 #endif
1406
1407         // {TRAPSTS/EXCP_FLAG_PRIV}.SAVE_CONTEXT and HOST_TRAP may have changed.
1408         // Only restore the other fields to avoid clobbering them.
1409         s_setreg_b32    hwreg(S_TRAPSTS_HWREG, 0, S_TRAPSTS_RESTORE_PART_1_SIZE), s_restore_trapsts
1410         s_lshr_b32      s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_2_SHIFT
1411         s_setreg_b32    hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_2_SHIFT, S_TRAPSTS_RESTORE_PART_2_SIZE), s_restore_trapsts
1412
1413 if S_TRAPSTS_RESTORE_PART_3_SIZE > 0
1414         s_lshr_b32      s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_3_SHIFT - S_TRAPSTS_RESTORE_PART_2_SHIFT
1415         s_setreg_b32    hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_3_SHIFT, S_TRAPSTS_RESTORE_PART_3_SIZE), s_restore_trapsts
1416 end
1417
1418         s_setreg_b32    hwreg(HW_REG_MODE), s_restore_mode
1419
1420         // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
1421         // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x50
1422         get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
1423         get_svgpr_size_bytes(s_restore_ttmps_hi)
1424         s_add_u32       s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
1425         s_add_u32       s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
1426         s_add_u32       s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
1427         s_addc_u32      s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
1428         s_and_b32       s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
1429         s_load_dwordx4  [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE
1430         s_load_dwordx4  [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE
1431         s_load_dword    ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE
1432         S_WAITCNT_0
1433
1434 #if HAVE_XNACK
1435         restore_ib_sts(s_restore_tmp, s_restore_m0)
1436 #endif
1437
1438         s_and_b32       s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff            //pc[47:32] //Do it here in order not to affect STATUS
1439         s_and_b64       exec, exec, exec                                        // Restore STATUS.EXECZ, not writable by s_setreg_b32
1440         s_and_b64       vcc, vcc, vcc                                           // Restore STATUS.VCCZ, not writable by s_setreg_b32
1441
1442 #if SW_SA_TRAP
1443         // If traps are enabled then return to the shader with PRIV=0.
1444         // Otherwise retain PRIV=1 for subsequent context save requests.
1445         s_getreg_b32    s_restore_tmp, hwreg(HW_REG_STATUS)
1446         s_bitcmp1_b32   s_restore_tmp, SQ_WAVE_STATUS_TRAP_EN_SHIFT
1447         s_cbranch_scc1  L_RETURN_WITHOUT_PRIV
1448
1449         s_setreg_b32    hwreg(HW_REG_STATUS), s_restore_status                  // SCC is included; it was changed by the preceding SALU instructions
1450         s_setpc_b64     [s_restore_pc_lo, s_restore_pc_hi]
1451 L_RETURN_WITHOUT_PRIV:
1452 #endif
1453
1454         s_setreg_b32    hwreg(S_STATUS_HWREG), s_restore_status                 // SCC is included; it was changed by the preceding SALU instructions
1455
1456 #if ASIC_FAMILY >= CHIP_GFX12
1457         // Make barrier and LDS state visible to all waves in the group.
1458         // STATE_PRIV.BARRIER_COMPLETE may change after this point.
1459         s_barrier_signal        -2
1460         s_barrier_wait  -2
1461 #endif
1462
1463         s_rfe_b64       s_restore_pc_lo                                         //Return to the main shader program and resume execution
1464
1465 L_END_PGM:
1466         s_endpgm_saved
1467 end
1468
1469 function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
1470 #if NO_SQC_STORE
1471         // Copy into VGPR for later TCP store.
1472         v_writelane_b32 v2, s, m0
1473         s_add_u32       m0, m0, 0x1
1474 #else
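        // exec_lo serves as scratch to preserve m0 while m0 carries the buffer offset for the store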
1475         s_mov_b32       exec_lo, m0
1476         s_mov_b32       m0, s_mem_offset
1477         s_buffer_store_dword    s, s_rsrc, m0 S_COHERENCE
1478         s_add_u32       s_mem_offset, s_mem_offset, 4
1479         s_mov_b32       m0, exec_lo
1480 #endif
1481 end
1482
1483
1484 function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
1485 #if NO_SQC_STORE
1486         // Copy into VGPR for later TCP store.
1487         for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
1488                 v_writelane_b32 v2, s[sgpr_idx], ttmp13
1489                 s_add_u32       ttmp13, ttmp13, 0x1
1490         end
1491 #else
1492         s_buffer_store_dwordx4  s[0], s_rsrc, 0 S_COHERENCE
1493         s_buffer_store_dwordx4  s[4], s_rsrc, 16 S_COHERENCE
1494         s_buffer_store_dwordx4  s[8], s_rsrc, 32 S_COHERENCE
1495         s_buffer_store_dwordx4  s[12], s_rsrc, 48 S_COHERENCE
1496         s_add_u32       s_rsrc[0], s_rsrc[0], 4*16
1497         s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0
1498 #endif
1499 end
1500
1501 function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
1502 #if NO_SQC_STORE
1503         // Copy into VGPR for later TCP store.
1504         for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
1505                 v_writelane_b32 v2, s[sgpr_idx], ttmp13
1506                 s_add_u32       ttmp13, ttmp13, 0x1
1507         end
1508 #else
1509         s_buffer_store_dwordx4  s[0], s_rsrc, 0 S_COHERENCE
1510         s_buffer_store_dwordx4  s[4], s_rsrc, 16 S_COHERENCE
1511         s_buffer_store_dwordx4  s[8], s_rsrc, 32 S_COHERENCE
1512         s_add_u32       s_rsrc[0], s_rsrc[0], 4*12
1513         s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0
1514 #endif
1515 end
1516
1517 function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
1518         s_buffer_load_dword     s, s_rsrc, s_mem_offset S_COHERENCE
1519         s_add_u32       s_mem_offset, s_mem_offset, 4
1520 end
1521
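// The SGPR readers below step the offset backward before each load; the SGPR
// restore walks from the top of the save area down to s0.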
1522 function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
1523         s_sub_u32       s_mem_offset, s_mem_offset, 4*16
1524         s_buffer_load_dwordx16  s, s_rsrc, s_mem_offset S_COHERENCE
1525 end
1526
1527 function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
1528         s_sub_u32       s_mem_offset, s_mem_offset, 4*8
1529         s_buffer_load_dwordx8   s, s_rsrc, s_mem_offset S_COHERENCE
1530 end
1531
1532 function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
1533         s_sub_u32       s_mem_offset, s_mem_offset, 4*4
1534         s_buffer_load_dwordx4   s, s_rsrc, s_mem_offset S_COHERENCE
1535 end
1536
1537 #if SAVE_AFTER_XNACK_ERROR
1538 function check_if_tcp_store_ok
1539         // If TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
1540         s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
1541         s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp
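        // SCC = 1 iff XNACK_ERROR is clear, i.e. TCP stores will succeed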
1542
1543 L_TCP_STORE_CHECK_DONE:
1544 end
1545
1546 function write_vgpr_to_mem_with_sqc(vgpr, n_lanes, s_rsrc, s_mem_offset)
1547         s_mov_b32 s4, 0
1548
1549 L_WRITE_VGPR_LANE_LOOP:
1550         for var lane = 0; lane < 4; ++lane
1551                 v_readlane_b32 s[lane], vgpr, s4
1552                 s_add_u32 s4, s4, 1
1553         end
1554
1555         s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1
1556
1557         s_add_u32 s_mem_offset, s_mem_offset, 0x10
1558         s_cmp_eq_u32 s4, n_lanes
1559         s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP
1560 end
1561
1562 function write_vgprs_to_mem_with_sqc_w32(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
1563         for var vgpr = 0; vgpr < n_vgprs; ++vgpr
1564                 write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 32, s_rsrc, s_mem_offset)
1565         end
1566 end
1567
1568 function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
1569         for var vgpr = 0; vgpr < n_vgprs; ++vgpr
1570                 write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 64, s_rsrc, s_mem_offset)
1571         end
1572 end
1573 #endif
1574
1575 function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
1576         s_getreg_b32    s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
1577         s_add_u32       s_vgpr_size_byte, s_vgpr_size_byte, 1
1578         s_bitcmp1_b32   s_size, S_WAVE_SIZE
1579         s_cbranch_scc1  L_ENABLE_SHIFT_W64
1580         s_lshl_b32      s_vgpr_size_byte, s_vgpr_size_byte, (2+7)               //VGPR save area in bytes = (vgpr_size + 1) * 4 * 32 * 4   (non-zero value)
1581         s_branch        L_SHIFT_DONE
1582 L_ENABLE_SHIFT_W64:
1583         s_lshl_b32      s_vgpr_size_byte, s_vgpr_size_byte, (2+8)               //VGPR save area in bytes = (vgpr_size + 1) * 4 * 64 * 4   (non-zero value)
1584 L_SHIFT_DONE:
1585 end
1586
1587 function get_svgpr_size_bytes(s_svgpr_size_byte)
1588         s_getreg_b32    s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
1589         s_lshl_b32      s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
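        //bytes = shared_vgpr_size * 8 regs * 32 lanes * 4 bytes = shared_vgpr_size << 10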
1590 end
1591
1592 function get_sgpr_size_bytes
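        // 128 SGPR slots * 4 bytes; the slots for s108~s127 are reserved even though they are not saved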
1593         return 512
1594 end
1595
1596 function get_hwreg_size_bytes
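        // 32 dword slots are reserved for the saved HW registers and ttmps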
1597         return 128
1598 end
1599
1600 function get_wave_size2(s_reg)
1601 #if ASIC_FAMILY < CHIP_GFX12
1602         s_getreg_b32    s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
1603 #else
1604         s_getreg_b32    s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE)
1605 #endif
1606         s_lshl_b32      s_reg, s_reg, S_WAVE_SIZE
1607 end
1608
1609 #if HAVE_XNACK
1610 function save_and_clear_ib_sts(tmp1, tmp2)
1611         // Preserve and clear scalar XNACK state before issuing scalar loads.
1612         // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
1613         // unused space ttmp11[31:24].
1614         s_andn2_b32     ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
1615         s_getreg_b32    tmp1, hwreg(HW_REG_IB_STS)
1616         s_and_b32       tmp2, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
1617         s_lshl_b32      tmp2, tmp2, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
1618         s_or_b32        ttmp11, ttmp11, tmp2
1619         s_and_b32       tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
1620         s_lshl_b32      tmp2, tmp2, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
1621         s_or_b32        ttmp11, ttmp11, tmp2
1622         s_andn2_b32     tmp1, tmp1, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
1623         s_setreg_b32    hwreg(HW_REG_IB_STS), tmp1
1624 end
1625
1626 function restore_ib_sts(tmp1, tmp2)
1627         s_lshr_b32      tmp1, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
1628         s_and_b32       tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
1629         s_lshr_b32      tmp1, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
1630         s_and_b32       tmp1, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
1631         s_or_b32        tmp1, tmp1, tmp2
1632         s_setreg_b32    hwreg(HW_REG_IB_STS), tmp1
1633 end
1634 #endif