/*
 * Copyright 2015-2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#if 0
HW (VI) source code for CWSR trap handler
#Version 18 + multiple trap handler

// this performance-optimal version was originally from Seven Xu at SRDC

// Revision #18   --...
/* Rev History
** #1. Branch from gc dv.   //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(skip, already fixed by PV), #54-56(merged), #57-58(merged, skipped - already fixed by PV)
** #4. SR Memory Layout:
**             1. VGPR-SGPR-HWREG-{LDS}
**             2. tba_hi.bits.26 - reconfigured as the first-wave-in-TG bit, to defer the LDS save for a threadgroup... performance concern...
** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer? (no need, already matches the swizzle pattern; needs more investigation)
** #7. Update: 1. don't barrier if noLDS
** #8. Branch: 1. Branch to ver#0, which is very similar to the gc dv version
**             2. Fix SQ issue by s_sleep 2
** #9. Update: 1. Fix the scc-restore-failed issue: restore wave_status last
**             2. optimize s_buffer save by bursting 16 SGPRs...
** #10. Update 1. Optimize restore sgpr by bursting 16 SGPRs.
** #11. Update 1. Add 2 more timestamps for the debug version
** #12. Update 1. Add VGPR SR using DWx4; some cases improve and some cases drop performance
** #13. Integ  1. Always use MUBUF for the PV trap shader...
** #14. Update 1. s_buffer_store soft clause...
** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combining; large performance improvement.
** #16. Update 1. PERF - unrolled LDS_DMA saves 2500 cycles in the IP tree
** #17. Update 1. FUNC - LDS_DMA has issues with ATC; replaced with ds_read/buffer_store for the save path [TODO: restore path]
**             2. PERF - Save LDS before saving VGPRs to hide the long LDS save latency...
** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
**             2. FUNC - Handle non-CWSR traps
*/

var G8SR_WDMEM_HWREG_OFFSET = 0
var G8SR_WDMEM_SGPR_OFFSET  = 128  // in bytes

// Keep these definitions in sync with the app shader; these two timestamps are part of the app shader and should come before any save and after the restore.

var G8SR_DEBUG_TIMESTAMP = 0
var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4  // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
var s_g8sr_ts_save_s    = s[34:35]   // save start
var s_g8sr_ts_sq_save_msg  = s[36:37]   // the save shader sends the SAVEWAVE msg to SPI
var s_g8sr_ts_spi_wrexec   = s[38:39]   // SPI writes the sr address to SQ
var s_g8sr_ts_save_d    = s[40:41]   // save end
var s_g8sr_ts_restore_s = s[42:43]   // restore start
var s_g8sr_ts_restore_d = s[44:45]   // restore end

var G8SR_VGPR_SR_IN_DWX4 = 0
var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000    // DWx4 stride is 4*4 bytes
var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
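
// Note: in a GCN buffer resource descriptor the stride field sits in WORD1
// bits [29:16], so S_SAVE_BUF_RSRC_WORD1_STRIDE below (0x00040000) encodes a
// 4-byte stride (one dword per lane) and the DWx4 value above (0x00100000)
// encodes a 16-byte stride.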


/*************************************************************************/
/*                  control on how to run the shader                     */
/*************************************************************************/
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in the EMU run)
var EMU_RUN_HACK                    =   0
var EMU_RUN_HACK_RESTORE_NORMAL     =   0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT   =   0
var EMU_RUN_HACK_SAVE_SINGLE_WAVE   =   0
var EMU_RUN_HACK_SAVE_FIRST_TIME    =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS                        =   1
var WG_BASE_ADDR_LO                 =   0x9000a000
var WG_BASE_ADDR_HI                 =   0x0
var WAVE_SPACE                      =   0x5000              //memory size that each wave occupies in workgroup state mem
var CTX_SAVE_CONTROL                =   0x0
var CTX_RESTORE_CONTROL             =   CTX_SAVE_CONTROL
var SIM_RUN_HACK                    =   0                   //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in the RTL run)
var SGPR_SAVE_USE_SQC               =   1                   //use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF      =   0                   //because TC EMU currently asserts on 0; overloads the DFMT field to carry 4 more bits of stride for MUBUF opcodes
var SWIZZLE_EN                      =   0                   //whether we use swizzled buffer addressing

/**************************************************************************/
/*                      variables                                         */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT  = 23
var SQ_WAVE_STATUS_INST_ATC_MASK   = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK   = 0x00000006

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT    = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE     = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT   = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE    = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT   = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE    = 3                     //FIXME  sq.blk still has 4 bits at this time while the SQ programming guide has 3 bits

var SQ_WAVE_TRAPSTS_SAVECTX_MASK    =   0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK       =   0x1FF                   // Exception mask
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT   =   10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK   =   0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT  =   8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK    =   0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT   =   0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE    =   10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK   =   0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT  =   11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE   =   21

var SQ_WAVE_IB_STS_RCNT_SHIFT           =   16                  //FIXME
var SQ_WAVE_IB_STS_RCNT_SIZE            =   4                   //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT   =   15                  //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE    =   1                   //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG   = 0x00007FFF    //FIXME
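
// Note: RCNT and FIRST_REPLAY hold the XNACK memory-replay state of the wave.
// The saved pc_hi only needs bits [15:0] for PC[47:32], so S_SAVE_PC_HI_*
// below stash these two fields in the free bits [31:28] and [27] of the
// saved PC across the save/restore cycle.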

var SQ_BUF_RSRC_WORD1_ATC_SHIFT     =   24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT   =   27


/*      Save        */
var S_SAVE_BUF_RSRC_WORD1_STRIDE        =   0x00040000          //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC          =   0x00807FAC          //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE

var S_SAVE_SPI_INIT_ATC_MASK            =   0x08000000          //bit[27]: ATC bit
var S_SAVE_SPI_INIT_ATC_SHIFT           =   27
var S_SAVE_SPI_INIT_MTYPE_MASK          =   0x70000000          //bit[30:28]: Mtype
var S_SAVE_SPI_INIT_MTYPE_SHIFT         =   28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK     =   0x04000000          //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT    =   26

var S_SAVE_PC_HI_RCNT_SHIFT             =   28                  //FIXME  check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK              =   0xF0000000          //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT     =   27                  //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK      =   0x08000000          //FIXME

var s_save_spi_init_lo              =   exec_lo
var s_save_spi_init_hi              =   exec_hi

                                                //tba_lo and tba_hi need to be saved/restored
var s_save_pc_lo            =   ttmp0           //{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
var s_save_pc_hi            =   ttmp1
var s_save_exec_lo          =   ttmp2
var s_save_exec_hi          =   ttmp3
var s_save_status           =   ttmp4
var s_save_trapsts          =   ttmp5           //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo    =   ttmp6
var s_save_xnack_mask_hi    =   ttmp7
var s_save_buf_rsrc0        =   ttmp8
var s_save_buf_rsrc1        =   ttmp9
var s_save_buf_rsrc2        =   ttmp10
var s_save_buf_rsrc3        =   ttmp11

var s_save_mem_offset       =   tma_lo
var s_save_alloc_size       =   s_save_trapsts          //conflict
var s_save_tmp              =   s_save_buf_rsrc2        //shared with s_save_buf_rsrc2  (conflict: should not use mem access with s_save_tmp at the same time)
var s_save_m0               =   tma_hi

/*      Restore     */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE         =   S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC           =   S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_ATC_MASK             =   0x08000000          //bit[27]: ATC bit
var S_RESTORE_SPI_INIT_ATC_SHIFT            =   27
var S_RESTORE_SPI_INIT_MTYPE_MASK           =   0x70000000          //bit[30:28]: Mtype
var S_RESTORE_SPI_INIT_MTYPE_SHIFT          =   28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK      =   0x04000000          //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT     =   26

var S_RESTORE_PC_HI_RCNT_SHIFT              =   S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK               =   S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT      =   S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK       =   S_SAVE_PC_HI_FIRST_REPLAY_MASK

var s_restore_spi_init_lo                   =   exec_lo
var s_restore_spi_init_hi                   =   exec_hi

var s_restore_mem_offset        =   ttmp2
var s_restore_alloc_size        =   ttmp3
var s_restore_tmp               =   ttmp6               //tba_lo/hi need to be restored
var s_restore_mem_offset_save   =   s_restore_tmp       //no conflict

var s_restore_m0            =   s_restore_alloc_size    //no conflict

var s_restore_mode          =   ttmp7

var s_restore_pc_lo         =   ttmp0
var s_restore_pc_hi         =   ttmp1
var s_restore_exec_lo       =   tma_lo                  //no conflict
var s_restore_exec_hi       =   tma_hi                  //no conflict
var s_restore_status        =   ttmp4
var s_restore_trapsts       =   ttmp5
var s_restore_xnack_mask_lo =   xnack_mask_lo
var s_restore_xnack_mask_hi =   xnack_mask_hi
var s_restore_buf_rsrc0     =   ttmp8
var s_restore_buf_rsrc1     =   ttmp9
var s_restore_buf_rsrc2     =   ttmp10
var s_restore_buf_rsrc3     =   ttmp11

/**************************************************************************/
/*                      trap handler entry points                         */
/**************************************************************************/
/* Shader Main*/

shader main
  asic(VI)
  type(CS)


    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))                   //hack to use trap_id for determining save/restore
        //FIXME VCCZ un-init assertion s_getreg_b32     s_save_status, hwreg(HW_REG_STATUS)         //save STATUS since we will change SCC
        s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000              //change SCC
        s_cmp_eq_u32 s_save_tmp, 0x007e0000                         //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
        s_cbranch_scc0 L_JUMP_TO_RESTORE                            //no need to recover STATUS here since we are going to RESTORE
        //FIXME  s_setreg_b32   hwreg(HW_REG_STATUS),   s_save_status       //need to recover STATUS since we are going to SAVE
        s_branch L_SKIP_RESTORE                                     //NOT a restore; actually a SAVE
    else
        s_branch L_SKIP_RESTORE                                     //NOT a restore; might be a regular trap or a save
    end

L_JUMP_TO_RESTORE:
    s_branch L_RESTORE                                              //restore

L_SKIP_RESTORE:

    s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)                             //save STATUS since we will change SCC
    s_andn2_b32     s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK      //clear SPI_PRIO
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    //check whether this is for save
    s_cbranch_scc1  L_SAVE                                      //this is the operation for save

    // *********    Handle non-CWSR traps       *******************
if (!EMU_RUN_HACK)
    /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
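    // TMA is assumed to point at a 16-byte record holding the second-level
    // handler's {tba_lo, tba_hi, tma_lo, tma_hi}; the s_or below sets SCC iff
    // the loaded TBA is non-zero, i.e. a second-level handler is installed.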
    s_load_dwordx4  [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
    s_waitcnt lgkmcnt(0)
    s_or_b32        ttmp7, ttmp8, ttmp9
    s_cbranch_scc0  L_NO_NEXT_TRAP //next level trap handler has not been set
    s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
    s_setpc_b64     [ttmp8,ttmp9] //jump to next level trap handler

L_NO_NEXT_TRAP:
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32       s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
    s_cbranch_scc1  L_EXCP_CASE   // Exception, jump back to the shader program directly.
    s_add_u32       ttmp0, ttmp0, 4   // S_TRAP case, add 4 to ttmp0
    s_addc_u32  ttmp1, ttmp1, 0
L_EXCP_CASE:
    s_and_b32   ttmp1, ttmp1, 0xFFFF
    s_setreg_b32    hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
    s_rfe_b64       [ttmp0, ttmp1]
end
    // *********        End handling of non-CWSR traps   *******************

/**************************************************************************/
/*                      save routine                                      */
/**************************************************************************/

L_SAVE:

if G8SR_DEBUG_TIMESTAMP
        s_memrealtime   s_g8sr_ts_save_s
        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
end

    //check whether there is a mem_viol
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32   s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
    s_cbranch_scc0  L_NO_PC_REWIND

    //if so, we need to rewind the PC, assuming a GDS operation got NACKed
    s_mov_b32       s_save_tmp, 0                                                           //clear mem_viol bit
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp    //clear mem_viol bit
    s_and_b32       s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
    s_sub_u32       s_save_pc_lo, s_save_pc_lo, 8             //pc[31:0]-8
    s_subb_u32      s_save_pc_hi, s_save_pc_hi, 0x0           // -scc

L_NO_PC_REWIND:
    s_mov_b32       s_save_tmp, 0                                                           //clear saveCtx bit
    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp     //clear saveCtx bit

    s_mov_b32       s_save_xnack_mask_lo,   xnack_mask_lo                                   //save XNACK_MASK
    s_mov_b32       s_save_xnack_mask_hi,   xnack_mask_hi    //XNACK_MASK must be saved before any memory operation
    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)                   //save RCNT
    s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
    s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)   //save FIRST_REPLAY
    s_lshl_b32      s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
    s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32    s_save_tmp, hwreg(HW_REG_IB_STS)                                        //clear RCNT and FIRST_REPLAY in IB_STS
    s_and_b32       s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG

    s_setreg_b32    hwreg(HW_REG_IB_STS), s_save_tmp

    /*      inform SPI of readiness and wait for SPI's go signal */
    s_mov_b32       s_save_exec_lo, exec_lo                                                 //save EXEC and use EXEC for the go signal from SPI
    s_mov_b32       s_save_exec_hi, exec_hi
    s_mov_b64       exec,   0x0                                                             //clear EXEC to get ready to receive
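
    // Handshake: with EXEC cleared, MSG_SAVEWAVE asks SPI to set up the save;
    // SPI signals "go" by writing the SR init values into this wave's EXEC
    // (wrexec), which is why s_save_spi_init_lo/hi alias exec_lo/exec_hi and
    // why the loop below spins while EXEC is still zero.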

if G8SR_DEBUG_TIMESTAMP
        s_memrealtime  s_g8sr_ts_sq_save_msg
        s_waitcnt lgkmcnt(0)
end

    if (EMU_RUN_HACK)

    else
        s_sendmsg   sendmsg(MSG_SAVEWAVE)  //send SPI a message and wait for SPI's write to EXEC
    end

  L_SLEEP:
    s_sleep 0x2                // sleep 1 (64clk) is not enough for 8 waves per SIMD and will cause an SQ hang, since the 7th/8th wave cannot get arbitration to execute an instruction while the other waves are stuck in the sleep loop waiting for wrexec != 0

    if (EMU_RUN_HACK)

    else
        s_cbranch_execz L_SLEEP
    end

if G8SR_DEBUG_TIMESTAMP
        s_memrealtime  s_g8sr_ts_spi_wrexec
        s_waitcnt lgkmcnt(0)
end

    /*      setup Resource Constants    */
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
        //calculate wd_addr using the absolute thread id
        v_readlane_b32 s_save_tmp, v9, 0
        s_lshr_b32 s_save_tmp, s_save_tmp, 6
        s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
    else
    end
    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
    else
    end


    s_mov_b32       s_save_buf_rsrc0,   s_save_spi_init_lo                                                      //base_addr_lo
    s_and_b32       s_save_buf_rsrc1,   s_save_spi_init_hi, 0x0000FFFF                                          //base_addr_hi
    s_or_b32        s_save_buf_rsrc1,   s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32       s_save_buf_rsrc2,   0                                                                       //NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
    s_mov_b32       s_save_buf_rsrc3,   S_SAVE_BUF_RSRC_WORD3_MISC
    s_and_b32       s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
    s_lshr_b32      s_save_tmp,         s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)         //get ATC bit into position
    s_or_b32        s_save_buf_rsrc3,   s_save_buf_rsrc3,  s_save_tmp                                           //or ATC
    s_and_b32       s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
    s_lshr_b32      s_save_tmp,         s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)     //get MTYPE bits into position
    s_or_b32        s_save_buf_rsrc3,   s_save_buf_rsrc3,  s_save_tmp                                           //or MTYPE

    //FIXME  right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi  (might need to save them before using them?)
    s_mov_b32       s_save_m0,          m0                                                                  //save M0

    /*      global mem offset           */
    s_mov_b32       s_save_mem_offset,  0x0                                                                     //mem offset initial value = 0




    /*      save HW registers   */
    //////////////////////////////

  L_SAVE_HWREG:
        // HWREG SR memory offset : size(VGPR)+size(SGPR)
       get_vgpr_size_bytes(s_save_mem_offset)
       get_sgpr_size_bytes(s_save_tmp)
       s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
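
       // The save area is laid out VGPR | SGPR | HWREG | {LDS} (see rev #4
       // above), so each block's offset is the sum of the sizes of the blocks
       // before it, computed from the wave's GPR_ALLOC/LDS_ALLOC fields.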


    s_mov_b32       s_save_buf_rsrc2, 0x4                               //NUM_RECORDS   in bytes
    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
    end


    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)                  //M0

    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
        s_mov_b32   tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
        s_mov_b32   tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
    end

    write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)                   //PC
    write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)             //EXEC
    write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)              //STATUS

    //s_save_trapsts conflicts with s_save_alloc_size
    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)             //TRAPSTS

    write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset)           //XNACK_MASK_LO
    write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset)           //XNACK_MASK_HI

    //using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
    s_getreg_b32    s_save_m0, hwreg(HW_REG_MODE)                                                   //MODE
    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset)                     //TBA_LO
    write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset)                     //TBA_HI



    /*      the first wave in the threadgroup    */
        // save the first_wave bit in unused bit 26 (originally tba_hi.bits[26])
    s_and_b32       s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK     // extract the first-wave bit
    //s_or_b32        tba_hi, s_save_tmp, tba_hi                                        // save first wave bit to tba_hi.bits[26]
    s_mov_b32        s_save_exec_hi, 0x0
    s_or_b32         s_save_exec_hi, s_save_tmp, s_save_exec_hi                          // save the first-wave bit to s_save_exec_hi.bits[26]


    /*          save SGPRs      */
        // Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_save_mem_offset)
    // TODO, change RSRC word to rearrange memory layout for SGPRS

    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)               //sgpr_size
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 4                         //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)

    if (SGPR_SAVE_USE_SQC)
        s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 2                    //NUM_RECORDS in bytes
    else
        s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 8                    //NUM_RECORDS in bytes (64 threads)
    end

    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
    end


    // back up s_save_buf_rsrc0,1 to s_save_xnack_mask_lo/hi, since the write_16sgpr_to_mem function will change rsrc0
    //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
    s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
    s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
    s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0

    s_mov_b32       m0, 0x0                         //SGPR initial index value =0
  L_SAVE_SGPR_LOOP:
    // SGPR is allocated in 16 SGPR granularity
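    // s_movrels adds M0 to the source register index, so each pass below
    // copies s[m0+0 .. m0+15] into the fixed staging registers s0..s15,
    // which are then written out as one 16-dword block.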
    s_movrels_b64   s0, s0     //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64   s2, s2     //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64   s4, s4     //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64   s6, s6     //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64   s8, s8     //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64   s10, s10   //s10 = s[10+m0], s11 = s[11+m0]
    s_movrels_b64   s12, s12   //s12 = s[12+m0], s13 = s[13+m0]
    s_movrels_b64   s14, s14   //s14 = s[14+m0], s15 = s[15+m0]

    write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should come from using s_buffer_store_dwordx4
    s_add_u32       m0, m0, 16                                                      //next sgpr index
    s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_SAVE_SGPR_LOOP                                    //SGPR save is complete?
    // restore s_save_buf_rsrc0,1
    //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
    s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo




    /*          save the first 4 VGPRs, which the LDS save can then use   */
        // each wave allocates at least 4 VGPRs...
    /////////////////////////////////////////////////////////////////////////////////////

    s_mov_b32       s_save_mem_offset, 0
    s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF

    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
    end


    // VGPR Allocated in 4-GPR granularity
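
// With ADD_TID_ENABLE set in the resource and a 4-byte stride, each
// buffer_store_dword writes one dword per lane at base + tid*4, so a single
// instruction covers 64 lanes * 4 bytes = 256 bytes; hence consecutive VGPRs
// are stored offset:256 apart.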

if G8SR_VGPR_SR_IN_DWX4
        // the const stride for DWx4 is 4*4 bytes
        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes

        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
else
        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
end



    /*          save LDS        */
    //////////////////////////////

  L_SAVE_LDS:

        // Change EXEC to all threads...
    s_mov_b32       exec_lo, 0xFFFFFFFF   //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)             //lds_size
    s_and_b32       s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF                //lds_size is zero?
    s_cbranch_scc0  L_SAVE_LDS_DONE                                                                            //no LDS used? jump to L_SAVE_LDS_DONE

    s_barrier               //LDS is used? wait for other waves in the same TG
    //s_and_b32     s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                //exec is still used here
    s_and_b32       s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK                //exec is still used here
    s_cbranch_scc0  L_SAVE_LDS_DONE

        // the first wave does the LDS save;

    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 6                         //LDS size in dwords = lds_size * 64dw
    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //LDS size in bytes
    s_mov_b32       s_save_buf_rsrc2,  s_save_alloc_size                            //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_save_mem_offset)
    get_sgpr_size_bytes(s_save_tmp)
    s_add_u32  s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()


    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                  //NUM_RECORDS in bytes
    end

    s_mov_b32       m0, 0x0                                               //lds_offset initial value = 0


var LDS_DMA_ENABLE = 0
var UNROLL = 0
if UNROLL==0 && LDS_DMA_ENABLE==1
        s_mov_b32  s3, 256*2
        s_nop 0
        s_nop 0
        s_nop 0
  L_SAVE_LDS_LOOP:
        //TODO: it looks like the 2-instruction buffer_store/load clause for s/r will hurt performance???
    if (SAVE_LDS)     //SPI always allocates LDS space in 128DW granularity
            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1            // first 64DW
            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
    end

    s_add_u32       m0, m0, s3                                          //every buffer_store_lds does 256 bytes
    s_add_u32       s_save_mem_offset, s_save_mem_offset, s3                            //mem offset increased by 256 bytes
    s_cmp_lt_u32    m0, s_save_alloc_size                                               //scc=(m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_SAVE_LDS_LOOP                                                     //LDS save is complete?

elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache misses
      // store from the highest LDS address to the lowest
      s_mov_b32  s3, 256*2
      s_sub_u32  m0, s_save_alloc_size, s3
      s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
      s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9   // how many 128-DW (512-byte) chunks...
      s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size   // store from the highest addr to the lowest
      s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4   // PC offset increment; each LDS save block costs 6*4 bytes of instructions
      s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4   //plus 3 DWORDs for the s_add/s_addc/s_setpc below
      s_nop 0
      s_nop 0
      s_nop 0   //pad 3 DWORDs to align LDS_DMA to 64 bytes
      s_getpc_b64 s[0:1]                              // reuse s[0:1], since s[0:1] is already saved
      s_add_u32   s0, s0,s_save_alloc_size
      s_addc_u32  s1, s1, 0
      s_setpc_b64 s[0:1]
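
      // The s_getpc/s_add/s_setpc sequence above is a computed jump into the
      // unrolled store sequence below: waves with a smaller LDS allocation
      // land past the leading iterations and execute only the tail that
      // covers their LDS size, storing from the highest address downwards.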


       for var i =0; i< 128; i++
            // careful: make this a 64-byte-aligned address, which can improve performance...
            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0           // first 64DW
            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256           // second 64DW

        if i!=127
            s_sub_u32  m0, m0, s3      // use an SGPR to shrink the 2-DW instruction to 1 DW, i.e. pack more LDS_DMA instructions into one cache line
            s_sub_u32  s_save_mem_offset, s_save_mem_offset,  s3
            end
       end

else   // BUFFER_STORE
      v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
      v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2     // tid
      v_mul_i32_i24 v2, v3, 8   // tid*8
      v_mov_b32 v3, 256*2
      s_mov_b32 m0, 0x10000
      s_mov_b32 s0, s_save_buf_rsrc3
      s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF    // disable add_tid
      s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000   //DFMT

L_SAVE_LDS_LOOP_VECTOR:
      ds_read_b64 v[0:1], v2    //x = LDS[a], byte address
      s_waitcnt lgkmcnt(0)
      buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1  glc:1  slc:1
//      s_waitcnt vmcnt(0)
      v_add_u32 v2, vcc[0:1], v2, v3
      v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
      s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR

      // restore rsrc3
      s_mov_b32 s_save_buf_rsrc3, s0

end

L_SAVE_LDS_DONE:


    /*          save VGPRs  - the rest of the VGPRs        */
    //////////////////////////////////////////////////////////////////////////////////////
  L_SAVE_VGPR:
    // VGPR SR memory offset: 0
    // TODO rearrange the RSRC words to use swizzle for VGPR save...

    s_mov_b32       s_save_mem_offset, (0+256*4)                                    // for the remaining VGPRs
    s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)                   //vgpr_size
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)   //FIXME for GFX, zero is possible
    s_lshl_b32      s_save_buf_rsrc2,  s_save_alloc_size, 8                         //NUM_RECORDS in bytes (64 threads*4)
    if (SWIZZLE_EN)
        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
    end


    // VGPR Allocated in 4-GPR granularity

if G8SR_VGPR_SR_IN_DWX4
        // the const stride for DWx4 is 4*4 bytes
        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes

        s_mov_b32         m0, 4     // skip the first 4 VGPRs
        s_cmp_lt_u32      m0, s_save_alloc_size
        s_cbranch_scc0    L_SAVE_VGPR_LOOP_END      // no more vgprs

        s_set_gpr_idx_on  m0, 0x1   // This will change M0
        s_add_u32         s_save_alloc_size, s_save_alloc_size, 0x1000  // because the instruction above changes m0
L_SAVE_VGPR_LOOP:
        v_mov_b32         v0, v0   // v0 = v[0+m0]
        v_mov_b32         v1, v1
        v_mov_b32         v2, v2
        v_mov_b32         v3, v3


        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        s_add_u32         m0, m0, 4
        s_add_u32         s_save_mem_offset, s_save_mem_offset, 256*4
        s_cmp_lt_u32      m0, s_save_alloc_size
    s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
    s_set_gpr_idx_off
L_SAVE_VGPR_LOOP_END:

        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
else
    // VGPR store using dw burst
    s_mov_b32         m0, 0x4   //VGPR initial index value = 4 (v0..v3 were saved earlier)
    s_cmp_lt_u32      m0, s_save_alloc_size
    s_cbranch_scc0    L_SAVE_VGPR_END


    s_set_gpr_idx_on    m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
    s_add_u32       s_save_alloc_size, s_save_alloc_size, 0x1000                    //add 0x1000 since we compare m0 against it later
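
    // With gpr-idx mode 0x1 (SRC0 relative), "v_mov_b32 v0, v0" actually reads
    // v[0+m0]; since m0 now carries the mode bits in [15:12], 0x1000 is folded
    // into s_save_alloc_size so the m0 comparison below stays consistent.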

  L_SAVE_VGPR_LOOP:
    v_mov_b32       v0, v0              //v0 = v[0+m0]
    v_mov_b32       v1, v1              //v1 = v[1+m0]
    v_mov_b32       v2, v2              //v2 = v[2+m0]
    v_mov_b32       v3, v3              //v3 = v[3+m0]

    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
    end

    s_add_u32       m0, m0, 4                                                       //next vgpr index
    s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4                     //every buffer_store_dword does 256 bytes
    s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
    s_set_gpr_idx_off
end

L_SAVE_VGPR_END:






    /*     S_PGM_END_SAVED  */                              //FIXME  graphics ONLY
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
        s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
        s_rfe_b64 s_save_pc_lo                              //Return to the main shader program
    else
    end

// Save Done timestamp
if G8SR_DEBUG_TIMESTAMP
        s_memrealtime   s_g8sr_ts_save_d
        // SGPR SR memory offset : size(VGPR)
        get_vgpr_size_bytes(s_save_mem_offset)
        s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
        // Need to reset rsrc2??
        s_mov_b32 m0, s_save_mem_offset
        s_mov_b32 s_save_buf_rsrc2,  0x1000000                                  //NUM_RECORDS in bytes
        s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0       glc:1
end


    s_branch    L_END_PGM



/**************************************************************************/
/*                      restore routine                                   */
/**************************************************************************/

L_RESTORE:
    /*      Setup Resource Constants    */
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
        //calculate wd_addr using the absolute thread id
        v_readlane_b32 s_restore_tmp, v9, 0
        s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
        s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
        s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
    else
    end

if G8SR_DEBUG_TIMESTAMP
        s_memrealtime   s_g8sr_ts_restore_s
        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
        // tma_lo/hi are sgprs 110 and 111, which will not be used in the 112-SGPR-allocated case...
        s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
        s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1]   //back up the timestamp to ttmp0/1, since exec will eventually be restored..
end



    s_mov_b32       s_restore_buf_rsrc0,    s_restore_spi_init_lo                                                           //base_addr_lo
    s_and_b32       s_restore_buf_rsrc1,    s_restore_spi_init_hi, 0x0000FFFF                                               //base_addr_hi
    s_or_b32        s_restore_buf_rsrc1,    s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32       s_restore_buf_rsrc2,    0                                                                               //NUM_RECORDS initial value = 0 (in bytes)
    s_mov_b32       s_restore_buf_rsrc3,    S_RESTORE_BUF_RSRC_WORD3_MISC
    s_and_b32       s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
    s_lshr_b32      s_restore_tmp,          s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)       //get ATC bit into position
    s_or_b32        s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                             //or ATC
    s_and_b32       s_restore_tmp,          s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
    s_lshr_b32      s_restore_tmp,          s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)   //get MTYPE bits into position
    s_or_b32        s_restore_buf_rsrc3,    s_restore_buf_rsrc3,  s_restore_tmp                                             //or MTYPE

    /*      global mem offset           */
//  s_mov_b32       s_restore_mem_offset, 0x0                               //mem offset initial value = 0

    /*      the first wave in the threadgroup    */
    s_and_b32       s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0  L_RESTORE_VGPR

    /*          restore LDS     */
    //////////////////////////////
  L_RESTORE_LDS:

    s_mov_b32       exec_lo, 0xFFFFFFFF                                                     //need every thread from now on   //be consistent with SAVE although this could be moved ahead
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)              //lds_size
    s_and_b32       s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF                  //lds_size is zero?
    s_cbranch_scc0  L_RESTORE_VGPR                                                          //no LDS used? jump to L_RESTORE_VGPR
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 6                           //LDS size in dwords = lds_size * 64dw
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //LDS size in bytes
    s_mov_b32       s_restore_buf_rsrc2,    s_restore_alloc_size                            //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32  s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()            //FIXME, check whether the offset overflows???


    if (SWIZZLE_EN)
        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
    end
    s_mov_b32       m0, 0x0                                                                 //lds_offset initial value = 0

  L_RESTORE_LDS_LOOP:
    if (SAVE_LDS)
        buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1                    // first 64DW
        buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256         // second 64DW
    end
    s_add_u32       m0, m0, 256*2                                               // 128 DW
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*2           //mem offset increased by 128DW
    s_cmp_lt_u32    m0, s_restore_alloc_size                                    //scc=(m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_RESTORE_LDS_LOOP                                                      //LDS restore is complete?


    /*          restore VGPRs       */
    //////////////////////////////
  L_RESTORE_VGPR:
        // VGPR SR memory offset : 0
    s_mov_b32       s_restore_mem_offset, 0x0
    s_mov_b32       exec_lo, 0xFFFFFFFF                                                     //need every thread from now on   //be consistent with SAVE although this could be moved ahead
    s_mov_b32       exec_hi, 0xFFFFFFFF

    s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)    //vgpr_size
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
    s_lshl_b32      s_restore_buf_rsrc2,  s_restore_alloc_size, 8                           //NUM_RECORDS in bytes (64 threads*4)
    if (SWIZZLE_EN)
        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
    end

if G8SR_VGPR_SR_IN_DWX4
     get_vgpr_size_bytes(s_restore_mem_offset)
     s_sub_u32         s_restore_mem_offset, s_restore_mem_offset, 256*4

     // the const stride for DWx4 is 4*4 bytes
     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes

     s_mov_b32         m0, s_restore_alloc_size
     s_set_gpr_idx_on  m0, 0x8    // Note.. This will change m0

L_RESTORE_VGPR_LOOP:
     buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
     s_waitcnt vmcnt(0)
     s_sub_u32         m0, m0, 4
     v_mov_b32         v0, v0   // v[0+m0] = v0
     v_mov_b32         v1, v1
     v_mov_b32         v2, v2
     v_mov_b32         v3, v3
     s_sub_u32         s_restore_mem_offset, s_restore_mem_offset, 256*4
     s_cmp_eq_u32      m0, 0x8000
     s_cbranch_scc0    L_RESTORE_VGPR_LOOP
     s_set_gpr_idx_off

     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes

else
    // VGPR load using dw burst
    s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset     // restore starts with v4; v0-v3 will be the last
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
    s_mov_b32       m0, 4                               //VGPR initial index value = 4
    s_set_gpr_idx_on  m0, 0x8                       //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 0x8000                      //add 0x8000 since we compare m0 against it later
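
    // With gpr-idx mode 0x8 (DST relative), "v_mov_b32 v0, v0" writes v[0+m0],
    // scattering each loaded group of 4 into place; the 0x8000 added above
    // mirrors the mode bits that m0 now carries in [15:12] for the comparison.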

  L_RESTORE_VGPR_LOOP:
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
    end
    s_waitcnt       vmcnt(0)                                                                //ensure data ready
    v_mov_b32       v0, v0                                                                  //v[0+m0] = v0
    v_mov_b32       v1, v1
    v_mov_b32       v2, v2
    v_mov_b32       v3, v3
    s_add_u32       m0, m0, 4                                                               //next vgpr index
    s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4                           //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32    m0, s_restore_alloc_size                                                //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1  L_RESTORE_VGPR_LOOP                                                     //VGPR restore (except v0-v3) is complete?
    s_set_gpr_idx_off
                                                                                            /* VGPR restore of v0-v3 */
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1
        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256
        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*2
        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*3
    end

end

    /*          restore SGPRs       */
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4     // restore SGPRs from s[n] down to s[0], in groups of 16
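
    // Restoring from the highest 16-SGPR group down to s[0..15] means the
    // staging registers s0..s15 are overwritten only on the final iteration,
    // after every other group has already been moved into place.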
932     // TODO, change RSRC word to rearrange memory layout for SGPRS
933
934     s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)                //spgr_size
935     s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
936     s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 4                           //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
937
938     if (SGPR_SAVE_USE_SQC)
939         s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 2                     //NUM_RECORDS in bytes
940     else
941         s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 8                     //NUM_RECORDS in bytes (64 threads)
942     end
943     if (SWIZZLE_EN)
944         s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
945     else
946         s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
947     end
948
949     /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111),
950        However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG
951     */
952     s_mov_b32 m0, s_restore_alloc_size
953
954  L_RESTORE_SGPR_LOOP:
955     read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)  //PV: further performance improvement can be made
956     s_waitcnt       lgkmcnt(0)                                                              //ensure data ready
957
958     s_sub_u32 m0, m0, 16    // Restore from S[n] to S[0]
959
960     s_movreld_b64   s0, s0      //s[0+m0] = s0
961     s_movreld_b64   s2, s2
962     s_movreld_b64   s4, s4
963     s_movreld_b64   s6, s6
964     s_movreld_b64   s8, s8
965     s_movreld_b64   s10, s10
966     s_movreld_b64   s12, s12
967     s_movreld_b64   s14, s14
968
969     s_cmp_eq_u32    m0, 0               //scc = (m0 < s_restore_alloc_size) ? 1 : 0
970     s_cbranch_scc0  L_RESTORE_SGPR_LOOP             //SGPR restore (except s0) is complete?
971
972     /*      restore HW registers    */
973     //////////////////////////////
974   L_RESTORE_HWREG:
975
976
977 if G8SR_DEBUG_TIMESTAMP
978       s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
979       s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
980 end
981
982     // HWREG SR memory offset : size(VGPR)+size(SGPR)
983     get_vgpr_size_bytes(s_restore_mem_offset)
984     get_sgpr_size_bytes(s_restore_tmp)
985     s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
986
987
988     s_mov_b32       s_restore_buf_rsrc2, 0x4                                                //NUM_RECORDS   in bytes
989     if (SWIZZLE_EN)
990         s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
991     else
992         s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
993     end
994
995     read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)                    //M0
996     read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)             //PC
997     read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
998     read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)               //EXEC
999     read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
1000     read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)                //STATUS
1001     read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)               //TRAPSTS
1002     read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset)                   //XNACK_MASK_LO
1003     read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset)                   //XNACK_MASK_HI
1004     read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)              //MODE
1005     read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset)                      //TBA_LO
1006     read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset)                      //TBA_HI
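    // Note: this read order mirrors the save-side layout of the 128-byte HWREG block:
    // M0, PC_LO/HI, EXEC_LO/HI, STATUS, TRAPSTS, XNACK_MASK_LO/HI, MODE, TBA_LO/HI --
    // 12 dwords in total, one dword per read_hwreg_from_mem call.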
1007
1008     s_waitcnt       lgkmcnt(0)                                                                                      //from now on, it is safe to restore STATUS and IB_STS
1009
1010     s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff      //pc[47:32]        //Do it here in order not to affect STATUS
1011
1012     //for normal save & restore, the saved PC points to the next instruction to execute, so no adjustment is needed; otherwise:
1013     if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
1014         s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8            //pc[31:0]+8     //two back-to-back s_trap are used (first for save and second for restore)
1015         s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
1016     end
1017     if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
1018     s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4            //pc[31:0]+4     // save is hacked through s_trap, but restore is normal
1019         s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
1020     end
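    // The +8/+4 adjustments are 64-bit adds: s_add_u32 produces the carry in scc,
    // which s_addc_u32 folds into pc_hi so a wrap of pc[31:0] is handled correctly.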
1021
1022     s_mov_b32       m0,         s_restore_m0
1023     s_mov_b32       exec_lo,    s_restore_exec_lo
1024     s_mov_b32       exec_hi,    s_restore_exec_hi
1025
1026     s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
1027     s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
1028     s_and_b32       s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
1029     s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
1030     s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
1031     //s_setreg_b32  hwreg(HW_REG_TRAPSTS),  s_restore_trapsts      //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
1032     s_setreg_b32    hwreg(HW_REG_MODE),     s_restore_mode
1033     //reuse s_restore_m0 as a temp register
1034     s_and_b32       s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
1035     s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
1036     s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
1037     s_mov_b32       s_restore_tmp, 0x0                                                                              //IB_STS is zero
1038     s_or_b32        s_restore_tmp, s_restore_tmp, s_restore_m0
1039     s_and_b32       s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
1040     s_lshr_b32      s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
1041     s_lshl_b32      s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
1042     s_or_b32        s_restore_tmp, s_restore_tmp, s_restore_m0
1043     s_and_b32       s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
1044     s_lshr_b32      s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
1045     s_setreg_b32    hwreg(HW_REG_IB_STS),   s_restore_tmp
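    // Summary: IB_STS is not saved as its own dword; the RCNT and FIRST_REPLAY fields
    // were stashed in unused high bits of the saved PC_HI, and are shifted back into
    // their SQ_WAVE_IB_STS positions here.  INST_ATC is extracted from the saved
    // STATUS into s_restore_m0 bit 0 for s_rfe_restore_b64 below.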
1046
1047     s_and_b64    exec, exec, exec  // Restore STATUS.EXECZ, not writable by s_setreg_b32
1048     s_and_b64    vcc, vcc, vcc  // Restore STATUS.VCCZ, not writable by s_setreg_b32
1049     s_setreg_b32    hwreg(HW_REG_STATUS),   s_restore_status     // SCC is included, which was changed by the preceding SALU instructions
1050
1051     s_barrier                                                   //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
1052
1053 if G8SR_DEBUG_TIMESTAMP
1054     s_memrealtime s_g8sr_ts_restore_d
1055     s_waitcnt lgkmcnt(0)
1056 end
1057
1058 //  s_rfe_b64 s_restore_pc_lo                                   //Return to the main shader program and resume execution
1059     s_rfe_restore_b64  s_restore_pc_lo, s_restore_m0            // s_restore_m0[0] is used to set STATUS.inst_atc
1060
1061
1062 /**************************************************************************/
1063 /*                      the END                                           */
1064 /**************************************************************************/
1065 L_END_PGM:
1066     s_endpgm
1067
1068 end
1069
1070
1071 /**************************************************************************/
1072 /*                      the helper functions                              */
1073 /**************************************************************************/
1074
1075 //Only for saving hwreg to mem
1076 function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
1077         s_mov_b32 exec_lo, m0                   //assuming exec_lo is not needed anymore from this point on
1078         s_mov_b32 m0, s_mem_offset
1079         s_buffer_store_dword s, s_rsrc, m0      glc:1
1080         s_add_u32       s_mem_offset, s_mem_offset, 4
1081         s_mov_b32   m0, exec_lo
1082 end
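// Note: the offset is staged through m0 (its caller value parked in exec_lo and
// restored afterwards); each call stores one dword and advances the offset by 4.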
1083
1084
1085 // HWREGs are saved before SGPRs, so all HWREGs can be used.
1086 function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
1087
1088         s_buffer_store_dwordx4 s[0], s_rsrc, 0  glc:1
1089         s_buffer_store_dwordx4 s[4], s_rsrc, 16  glc:1
1090         s_buffer_store_dwordx4 s[8], s_rsrc, 32  glc:1
1091         s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
1092         s_add_u32       s_rsrc[0], s_rsrc[0], 4*16
1093         s_addc_u32      s_rsrc[1], s_rsrc[1], 0x0             // +scc
1094 end
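// Note: unlike write_hwreg_to_mem, this advances the 64-bit base address in the
// buffer resource itself (rsrc word0/word1, with carry) by 64 bytes per call,
// rather than maintaining a separate offset register.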
1095
1096
1097 function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
1098     s_buffer_load_dword s, s_rsrc, s_mem_offset     glc:1
1099     s_add_u32       s_mem_offset, s_mem_offset, 4
1100 end
1101
1102 function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
1103     s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset      glc:1
1104     s_sub_u32       s_mem_offset, s_mem_offset, 4*16
1105 end
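// Note: the offset is decremented by 64 bytes after each load, matching the
// descending S[n]..S[0] restore order used by L_RESTORE_SGPR_LOOP above.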
1106
1107
1108
1109 function get_lds_size_bytes(s_lds_size_byte)
1110     // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
1111     s_getreg_b32   s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)          // lds_size
1112     s_lshl_b32     s_lds_size_byte, s_lds_size_byte, 8                      //LDS size in bytes = lds_size * 64DW * 4Bytes    // granularity 64DW
1113 end
1114
1115 function get_vgpr_size_bytes(s_vgpr_size_byte)
1116     s_getreg_b32   s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)  //vgpr_size
1117     s_add_u32      s_vgpr_size_byte, s_vgpr_size_byte, 1
1118     s_lshl_b32     s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 64 threads * 4 Bytes   (non-zero value)   //FIXME for GFX, zero is possible
1119 end
1120
1121 function get_sgpr_size_bytes(s_sgpr_size_byte)
1122     s_getreg_b32   s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)  //sgpr_size
1123     s_add_u32      s_sgpr_size_byte, s_sgpr_size_byte, 1
1124     s_lshl_b32     s_sgpr_size_byte, s_sgpr_size_byte, 6 //SGPR size in bytes = (sgpr_size + 1) * 16 SGPRs * 4 Bytes   (non-zero value)
1125 end
1126
1127 function get_hwreg_size_bytes
1128     return 128 //HWREG size 128 bytes
1129 end
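
// The helpers above reduce to simple arithmetic.  The C sketch below is not part
// of the shader; names are illustrative.  It models the save-area sizing and the
// region offsets, assuming 64-thread wavefronts, the 128-byte HWREG block, and
// the VGPR-SGPR-HWREG ordering used by the restore code above (LDS following
// HWREG is an assumption here).

#include <stdint.h>
#include <stdio.h>

/* Field values as read from HW_REG_GPR_ALLOC / HW_REG_LDS_ALLOC. */
struct alloc_fields { uint32_t vgpr_size, sgpr_size, lds_size; };

static uint32_t vgpr_size_bytes(uint32_t f) { return (f + 1) << (2 + 8); } /* (f+1)*4 VGPRs * 64 threads * 4 B */
static uint32_t sgpr_size_bytes(uint32_t f) { return (f + 1) << 6; }       /* (f+1)*16 SGPRs * 4 B */
static uint32_t lds_size_bytes(uint32_t f)  { return f << 8; }             /* f * 64 DW * 4 B */
static uint32_t hwreg_size_bytes(void)      { return 128; }

int main(void)
{
        struct alloc_fields a = { .vgpr_size = 3, .sgpr_size = 6, .lds_size = 2 };
        uint32_t vgpr = vgpr_size_bytes(a.vgpr_size);   /* 4096 bytes: 16 VGPRs */
        uint32_t sgpr = sgpr_size_bytes(a.sgpr_size);   /* 448 bytes: 112 SGPRs */

        /* Region offsets within the save area, matching the restore code above;
         * the LDS offset assumes LDS is saved directly after the HWREG block. */
        printf("SGPR region at %u, HWREG region at %u, LDS region at %u\n",
               vgpr, vgpr + sgpr, vgpr + sgpr + hwreg_size_bytes());
        printf("LDS bytes: %u\n", lds_size_bytes(a.lds_size));
        return 0;
}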
1130
1131
1132 #endif
1133
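/*
 * Assembled binary of the trap handler source above.  The driver loads this
 * pre-built image at runtime, so the #if 0 listing serves as its reference
 * source.
 */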
1134 static const uint32_t cwsr_trap_gfx8_hex[] = {
1135         0xbf820001, 0xbf820123,
1136         0xb8f4f802, 0x89748674,
1137         0xb8f5f803, 0x8675ff75,
1138         0x00000400, 0xbf850011,
1139         0xc00a1e37, 0x00000000,
1140         0xbf8c007f, 0x87777978,
1141         0xbf840002, 0xb974f802,
1142         0xbe801d78, 0xb8f5f803,
1143         0x8675ff75, 0x000001ff,
1144         0xbf850002, 0x80708470,
1145         0x82718071, 0x8671ff71,
1146         0x0000ffff, 0xb974f802,
1147         0xbe801f70, 0xb8f5f803,
1148         0x8675ff75, 0x00000100,
1149         0xbf840006, 0xbefa0080,
1150         0xb97a0203, 0x8671ff71,
1151         0x0000ffff, 0x80f08870,
1152         0x82f18071, 0xbefa0080,
1153         0xb97a0283, 0xbef60068,
1154         0xbef70069, 0xb8fa1c07,
1155         0x8e7a9c7a, 0x87717a71,
1156         0xb8fa03c7, 0x8e7a9b7a,
1157         0x87717a71, 0xb8faf807,
1158         0x867aff7a, 0x00007fff,
1159         0xb97af807, 0xbef2007e,
1160         0xbef3007f, 0xbefe0180,
1161         0xbf900004, 0xbf8e0002,
1162         0xbf88fffe, 0xbef8007e,
1163         0x8679ff7f, 0x0000ffff,
1164         0x8779ff79, 0x00040000,
1165         0xbefa0080, 0xbefb00ff,
1166         0x00807fac, 0x867aff7f,
1167         0x08000000, 0x8f7a837a,
1168         0x877b7a7b, 0x867aff7f,
1169         0x70000000, 0x8f7a817a,
1170         0x877b7a7b, 0xbeef007c,
1171         0xbeee0080, 0xb8ee2a05,
1172         0x806e816e, 0x8e6e8a6e,
1173         0xb8fa1605, 0x807a817a,
1174         0x8e7a867a, 0x806e7a6e,
1175         0xbefa0084, 0xbefa00ff,
1176         0x01000000, 0xbefe007c,
1177         0xbefc006e, 0xc0611bfc,
1178         0x0000007c, 0x806e846e,
1179         0xbefc007e, 0xbefe007c,
1180         0xbefc006e, 0xc0611c3c,
1181         0x0000007c, 0x806e846e,
1182         0xbefc007e, 0xbefe007c,
1183         0xbefc006e, 0xc0611c7c,
1184         0x0000007c, 0x806e846e,
1185         0xbefc007e, 0xbefe007c,
1186         0xbefc006e, 0xc0611cbc,
1187         0x0000007c, 0x806e846e,
1188         0xbefc007e, 0xbefe007c,
1189         0xbefc006e, 0xc0611cfc,
1190         0x0000007c, 0x806e846e,
1191         0xbefc007e, 0xbefe007c,
1192         0xbefc006e, 0xc0611d3c,
1193         0x0000007c, 0x806e846e,
1194         0xbefc007e, 0xb8f5f803,
1195         0xbefe007c, 0xbefc006e,
1196         0xc0611d7c, 0x0000007c,
1197         0x806e846e, 0xbefc007e,
1198         0xbefe007c, 0xbefc006e,
1199         0xc0611dbc, 0x0000007c,
1200         0x806e846e, 0xbefc007e,
1201         0xbefe007c, 0xbefc006e,
1202         0xc0611dfc, 0x0000007c,
1203         0x806e846e, 0xbefc007e,
1204         0xb8eff801, 0xbefe007c,
1205         0xbefc006e, 0xc0611bfc,
1206         0x0000007c, 0x806e846e,
1207         0xbefc007e, 0xbefe007c,
1208         0xbefc006e, 0xc0611b3c,
1209         0x0000007c, 0x806e846e,
1210         0xbefc007e, 0xbefe007c,
1211         0xbefc006e, 0xc0611b7c,
1212         0x0000007c, 0x806e846e,
1213         0xbefc007e, 0x867aff7f,
1214         0x04000000, 0xbef30080,
1215         0x8773737a, 0xb8ee2a05,
1216         0x806e816e, 0x8e6e8a6e,
1217         0xb8f51605, 0x80758175,
1218         0x8e758475, 0x8e7a8275,
1219         0xbefa00ff, 0x01000000,
1220         0xbef60178, 0x80786e78,
1221         0x82798079, 0xbefc0080,
1222         0xbe802b00, 0xbe822b02,
1223         0xbe842b04, 0xbe862b06,
1224         0xbe882b08, 0xbe8a2b0a,
1225         0xbe8c2b0c, 0xbe8e2b0e,
1226         0xc06b003c, 0x00000000,
1227         0xc06b013c, 0x00000010,
1228         0xc06b023c, 0x00000020,
1229         0xc06b033c, 0x00000030,
1230         0x8078c078, 0x82798079,
1231         0x807c907c, 0xbf0a757c,
1232         0xbf85ffeb, 0xbef80176,
1233         0xbeee0080, 0xbefe00c1,
1234         0xbeff00c1, 0xbefa00ff,
1235         0x01000000, 0xe0724000,
1236         0x6e1e0000, 0xe0724100,
1237         0x6e1e0100, 0xe0724200,
1238         0x6e1e0200, 0xe0724300,
1239         0x6e1e0300, 0xbefe00c1,
1240         0xbeff00c1, 0xb8f54306,
1241         0x8675c175, 0xbf84002c,
1242         0xbf8a0000, 0x867aff73,
1243         0x04000000, 0xbf840028,
1244         0x8e758675, 0x8e758275,
1245         0xbefa0075, 0xb8ee2a05,
1246         0x806e816e, 0x8e6e8a6e,
1247         0xb8fa1605, 0x807a817a,
1248         0x8e7a867a, 0x806e7a6e,
1249         0x806eff6e, 0x00000080,
1250         0xbefa00ff, 0x01000000,
1251         0xbefc0080, 0xd28c0002,
1252         0x000100c1, 0xd28d0003,
1253         0x000204c1, 0xd1060002,
1254         0x00011103, 0x7e0602ff,
1255         0x00000200, 0xbefc00ff,
1256         0x00010000, 0xbe80007b,
1257         0x867bff7b, 0xff7fffff,
1258         0x877bff7b, 0x00058000,
1259         0xd8ec0000, 0x00000002,
1260         0xbf8c007f, 0xe0765000,
1261         0x6e1e0002, 0x32040702,
1262         0xd0c9006a, 0x0000eb02,
1263         0xbf87fff7, 0xbefb0000,
1264         0xbeee00ff, 0x00000400,
1265         0xbefe00c1, 0xbeff00c1,
1266         0xb8f52a05, 0x80758175,
1267         0x8e758275, 0x8e7a8875,
1268         0xbefa00ff, 0x01000000,
1269         0xbefc0084, 0xbf0a757c,
1270         0xbf840015, 0xbf11017c,
1271         0x8075ff75, 0x00001000,
1272         0x7e000300, 0x7e020301,
1273         0x7e040302, 0x7e060303,
1274         0xe0724000, 0x6e1e0000,
1275         0xe0724100, 0x6e1e0100,
1276         0xe0724200, 0x6e1e0200,
1277         0xe0724300, 0x6e1e0300,
1278         0x807c847c, 0x806eff6e,
1279         0x00000400, 0xbf0a757c,
1280         0xbf85ffef, 0xbf9c0000,
1281         0xbf8200ca, 0xbef8007e,
1282         0x8679ff7f, 0x0000ffff,
1283         0x8779ff79, 0x00040000,
1284         0xbefa0080, 0xbefb00ff,
1285         0x00807fac, 0x8676ff7f,
1286         0x08000000, 0x8f768376,
1287         0x877b767b, 0x8676ff7f,
1288         0x70000000, 0x8f768176,
1289         0x877b767b, 0x8676ff7f,
1290         0x04000000, 0xbf84001e,
1291         0xbefe00c1, 0xbeff00c1,
1292         0xb8f34306, 0x8673c173,
1293         0xbf840019, 0x8e738673,
1294         0x8e738273, 0xbefa0073,
1295         0xb8f22a05, 0x80728172,
1296         0x8e728a72, 0xb8f61605,
1297         0x80768176, 0x8e768676,
1298         0x80727672, 0x8072ff72,
1299         0x00000080, 0xbefa00ff,
1300         0x01000000, 0xbefc0080,
1301         0xe0510000, 0x721e0000,
1302         0xe0510100, 0x721e0000,
1303         0x807cff7c, 0x00000200,
1304         0x8072ff72, 0x00000200,
1305         0xbf0a737c, 0xbf85fff6,
1306         0xbef20080, 0xbefe00c1,
1307         0xbeff00c1, 0xb8f32a05,
1308         0x80738173, 0x8e738273,
1309         0x8e7a8873, 0xbefa00ff,
1310         0x01000000, 0xbef60072,
1311         0x8072ff72, 0x00000400,
1312         0xbefc0084, 0xbf11087c,
1313         0x8073ff73, 0x00008000,
1314         0xe0524000, 0x721e0000,
1315         0xe0524100, 0x721e0100,
1316         0xe0524200, 0x721e0200,
1317         0xe0524300, 0x721e0300,
1318         0xbf8c0f70, 0x7e000300,
1319         0x7e020301, 0x7e040302,
1320         0x7e060303, 0x807c847c,
1321         0x8072ff72, 0x00000400,
1322         0xbf0a737c, 0xbf85ffee,
1323         0xbf9c0000, 0xe0524000,
1324         0x761e0000, 0xe0524100,
1325         0x761e0100, 0xe0524200,
1326         0x761e0200, 0xe0524300,
1327         0x761e0300, 0xb8f22a05,
1328         0x80728172, 0x8e728a72,
1329         0xb8f61605, 0x80768176,
1330         0x8e768676, 0x80727672,
1331         0x80f2c072, 0xb8f31605,
1332         0x80738173, 0x8e738473,
1333         0x8e7a8273, 0xbefa00ff,
1334         0x01000000, 0xbefc0073,
1335         0xc031003c, 0x00000072,
1336         0x80f2c072, 0xbf8c007f,
1337         0x80fc907c, 0xbe802d00,
1338         0xbe822d02, 0xbe842d04,
1339         0xbe862d06, 0xbe882d08,
1340         0xbe8a2d0a, 0xbe8c2d0c,
1341         0xbe8e2d0e, 0xbf06807c,
1342         0xbf84fff1, 0xb8f22a05,
1343         0x80728172, 0x8e728a72,
1344         0xb8f61605, 0x80768176,
1345         0x8e768676, 0x80727672,
1346         0xbefa0084, 0xbefa00ff,
1347         0x01000000, 0xc0211cfc,
1348         0x00000072, 0x80728472,
1349         0xc0211c3c, 0x00000072,
1350         0x80728472, 0xc0211c7c,
1351         0x00000072, 0x80728472,
1352         0xc0211bbc, 0x00000072,
1353         0x80728472, 0xc0211bfc,
1354         0x00000072, 0x80728472,
1355         0xc0211d3c, 0x00000072,
1356         0x80728472, 0xc0211d7c,
1357         0x00000072, 0x80728472,
1358         0xc0211a3c, 0x00000072,
1359         0x80728472, 0xc0211a7c,
1360         0x00000072, 0x80728472,
1361         0xc0211dfc, 0x00000072,
1362         0x80728472, 0xc0211b3c,
1363         0x00000072, 0x80728472,
1364         0xc0211b7c, 0x00000072,
1365         0x80728472, 0xbf8c007f,
1366         0x8671ff71, 0x0000ffff,
1367         0xbefc0073, 0xbefe006e,
1368         0xbeff006f, 0x867375ff,
1369         0x000003ff, 0xb9734803,
1370         0x867375ff, 0xfffff800,
1371         0x8f738b73, 0xb973a2c3,
1372         0xb977f801, 0x8673ff71,
1373         0xf0000000, 0x8f739c73,
1374         0x8e739073, 0xbef60080,
1375         0x87767376, 0x8673ff71,
1376         0x08000000, 0x8f739b73,
1377         0x8e738f73, 0x87767376,
1378         0x8673ff74, 0x00800000,
1379         0x8f739773, 0xb976f807,
1380         0x86fe7e7e, 0x86ea6a6a,
1381         0xb974f802, 0xbf8a0000,
1382         0x95807370, 0xbf810000,
1383 };
1384