1 /* SPDX-License-Identifier: MIT */
3 * Copyright 2024 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
24 // This shader cleans LDS, SGPRs and VGPRs. It is the first 64 Dwords (256 bytes) of the 192-Dword cleaner shader.
25 // To turn this shader program on for compilation, change this to main and the lower shader main to main_1
27 // Navi3 : Clear SGPRs, VGPRs and LDS
28 // Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot
29 // Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD
30 // Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS)
31 // It takes 2 workgroups to use all of LDS: one on each CU of the WGP
32 // Each wave clears SGPRs 0 - 107
33 // Each wave clears VGPRs 0 - 63
34 // The first wave of the workgroup clears its 64KB of LDS
35 // The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
36 // before any wave in the workgroup could end. Without this, it is possible not all SGPRs get cleared.
42 // Note: original source code from SQ team
44 // Takes about 2500 clocks to run.
45 // (theoretical fastest = 1024clks vgpr + 640lds = 1660 clks)
// First unrolled clearing loop (presumably the VGPR clear): counter setup and back-edge.
// m0 is initialised to 0x58 (88). A count-down by 8 that exits on borrow would
// run for 88, 80, ..., 0 = 12 passes, which matches the "96/8=12" note below.
// NOTE(review): the file header says each wave has 64 VGPRs (0 - 63), while this
// comment sizes the loop for 96 registers -- confirm which count is intended.
// NOTE(review): the loop body, the counter update and the label_0005 definition
// are not adjacent to these two instructions -- confirm against the full shader.
52 s_mov_b32 m0, 0x00000058 // Loop 96/8=12 times (loop unrolled for performance)
// Back-edge: keep looping to label_0005 while SCC == 0.
64 s_cbranch_scc0 label_0005
// Decide whether this wave is responsible for clearing LDS.
// s2 = mask selecting bit 31 of s0, which the comments below identify as the
// first_wave flag delivered through the tg_size SGPR.
68 s_mov_b32 s2, 0x80000000 // Bit31 is first_wave
// s2 = s0 & 0x80000000; the result is non-zero only when the first_wave bit is set.
// (S_AND_B32 is expected to set SCC when the result is non-zero -- see the ISA guide.)
69 s_and_b32 s2, s2, s0 // sgpr0 has tg_size (first_wave) term as in ucode only COMPUTE_PGM_RSRC2.tg_size_en is set
// SCC == 0 means "not the first wave": branch to label_0023, presumably skipping
// the LDS clear. The first wave of the workgroup falls through and cleans LDS.
70 s_cbranch_scc0 label_0023 // Clean LDS if it's the first wave of ThreadGroup/WorkGroup (other waves branch away)
// LDS clear, executed on the fall-through path of the first_wave test.
// Enable every lane so each one contributes a store.
73 s_mov_b32 exec_lo, 0xffffffff
74 s_mov_b32 exec_hi, 0xffffffff
// v1 = lane index within the wave, built from the two mbcnt halves.
// NOTE(review): the file header states the waves are wave32, in which case the
// lane index would be 0..31; the "(0..63)" wording below looks inherited from a
// wave64 variant -- confirm.
75 v_mbcnt_lo_u32_b32 v1, exec_hi, 0 // Set V1 to thread-ID (0..63)
76 v_mbcnt_hi_u32_b32 v1, exec_lo, v1 // Set V1 to thread-ID (0..63)
// v1 = lane index * 8: per-lane byte address, one 8-byte slot per lane.
77 v_mul_u32_u24 v1, 0x00000008, v1 // * 8, so each thread is a double-dword address (8byte)
// s2 = 0x3f (63): a count-down by 1 that exits on borrow gives 64 passes.
// NOTE(review): the counter decrement is not adjacent to the stores below -- confirm.
78 s_mov_b32 s2, 0x00000003f // 64 loop iterations
// m0 = all ones. Presumably lifts the M0-based LDS size clamp on targets that
// use M0 for DS bounds checking -- TODO confirm for this ASIC.
79 s_mov_b32 m0, 0xffffffff
80 // Clear all of LDS space
81 // Each FirstWave of WorkGroup clears 64kbyte block
// Four 8-byte stores per lane per pass, at slot offsets 0, 32, 64 and 96
// (assuming ds_write2_b64 offsets are in 8-byte units, i.e. +0, +256, +512, +768 bytes).
// v[2:3] and v[4:5] supply the data; they are presumably already zero from the
// earlier register-clearing loop -- verify against the full shader.
84 ds_write2_b64 v1, v[2:3], v[2:3] offset1:32
85 ds_write2_b64 v1, v[4:5], v[4:5] offset0:64 offset1:96
// Advance the per-lane address by 0x400 (1024) bytes for the next pass;
// 64 passes * 1024 bytes = 64 KB, matching the block size noted above.
86 v_add_co_u32 v1, vcc, 0x00000400, v1
// Back-edge: repeat from label_001F while SCC == 0.
88 s_cbranch_scc0 label_001F
// SGPR clearing loop: counter setup and back-edge.
// m0 is initialised to 0x68 (104). A count-down by 4 that exits on borrow would
// run for 104, 100, ..., 0 = 27 passes, which matches the "108/4=27" note below
// and the header's "SGPRs 0 - 107".
// NOTE(review): the loop body, the counter update and the label_sgpr_loop
// definition are not adjacent to these two instructions -- confirm against the
// full shader.
93 s_mov_b32 m0, 0x00000068 // Loop 108/4=27 times (loop unrolled for performance)
// Back-edge: keep looping to label_sgpr_loop while SCC == 0.
100 s_cbranch_scc0 label_sgpr_loop
// Zero the scalar registers that the indexed SGPR loop is not sized to cover
// (it is sized for SGPRs 0 - 107): VCC, FLAT_SCRATCH and the trap temporaries
// ttmp0..ttmp15. Each s_mov_b64 clears an aligned pair of 32-bit registers.
103 s_mov_b64 vcc, 0 //clear vcc
104 s_mov_b32 flat_scratch_lo, 0 //clear flat scratch lo SGPR
105 s_mov_b32 flat_scratch_hi, 0 //clear flat scratch hi SGPR
// Trap temporaries, cleared two at a time.
// NOTE(review): whether ttmp writes take effect may depend on the wave's
// privilege level -- confirm for the target ASIC.
106 s_mov_b64 ttmp0, 0 //Clear ttmp0 and ttmp1
107 s_mov_b64 ttmp2, 0 //Clear ttmp2 and ttmp3
108 s_mov_b64 ttmp4, 0 //Clear ttmp4 and ttmp5
109 s_mov_b64 ttmp6, 0 //Clear ttmp6 and ttmp7
110 s_mov_b64 ttmp8, 0 //Clear ttmp8 and ttmp9
111 s_mov_b64 ttmp10, 0 //Clear ttmp10 and ttmp11
112 s_mov_b64 ttmp12, 0 //Clear ttmp12 and ttmp13
113 s_mov_b64 ttmp14, 0 //Clear ttmp14 and ttmp15