drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3_cleaner_shader.asm

   1 /* SPDX-License-Identifier: MIT */
   2 /*
   3  * Copyright 2024 Advanced Micro Devices, Inc.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in
  13  * all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  21  * OTHER DEALINGS IN THE SOFTWARE.
  22  */
  23
   24 // This shader cleans LDS, SGPRs and VGPRs. It is the first 64 Dwords (256 bytes) of the 192-Dword cleaner shader.
   25 // To turn this shader program on for compilation, change this to main and the lower shader main to main_1.
  26
  27 // MI300 : Clear SGPRs, VGPRs and LDS
  28 //   Uses two kernels launched separately:
  29 //   1. Clean VGPRs, LDS, and lower SGPRs
  30 //        Launches one workgroup per CU, each workgroup with 4x wave64 per SIMD in the CU
  31 //        Waves are "wave64" and have 128 VGPRs each, which uses all 512 VGPRs per SIMD
  32 //        Waves in the workgroup share the 64KB of LDS
  33 //        Each wave clears SGPRs 0 - 95. Because there are 4 waves/SIMD, this is physical SGPRs 0-383
  34 //        Each wave clears 128 VGPRs, so all 512 in the SIMD
  35 //        The first wave of the workgroup clears its 64KB of LDS
  36 //        The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
  37 //          before any wave in the workgroup could end.  Without this, it is possible not all SGPRs get cleared.
  38 //    2. Clean remaining SGPRs
  39 //        Launches a workgroup with 24 waves per workgroup, yielding 6 waves per SIMD in each CU
  40 //        Waves are allocating 96 SGPRs
  41 //          CP sets up SPI_RESOURCE_RESERVE_* registers to prevent these waves from allocating SGPRs 0-223.
  42 //          As such, these 6 waves per SIMD are allocated physical SGPRs 224-799
  43 //        Barriers do not work for >16 waves per workgroup, so we cannot start with S_BARRIER
  44 //          Instead, the shader starts with an S_SETHALT 1. Once all waves are launched CP will send unhalt command
  45 //        The shader then clears all SGPRs allocated to it, cleaning out physical SGPRs 224-799
  46
   47 shader main
   48   asic(MI300)
   49   type(CS)
   50   wave_size(64)
   51 // Note: original source code from SQ team
   52 // Entry: s0 == 1 takes the VGPR + LDS + SGPR cleaning path; any other s0 value takes the SGPR-only path at label_0023.
   53 //   (theoretical fastest = ~512clks vgpr + 1536 lds + ~128 sgpr  = 2176 clks)
   54
   55   s_cmp_eq_u32 s0, 1                                // SCC = (s0 == 1); s0 == 1 requests the VGPR and LDS clear (FW sets it via COMPUTE_USER_DATA_3)
   56   s_cbranch_scc0  label_0023                        // SCC == 0 (s0 != 1): skip the VGPR/LDS clear and go to the SGPR-only path
   57   S_BARRIER                                         // Ensure all waves of the workgroup have launched before any wave can end
   58
   59   s_movk_i32    m0, 0x0000                          // M0 = 0 before VGPR indexing is enabled
   60   s_mov_b32     s2, 0x00000078  // Index starts at 120 and steps down by 8: 128/8=16 iterations  (loop unrolled for performance)
   61   //
   62   // CLEAR VGPRs
   63   //
   64   s_set_gpr_idx_on  s2, 0x8    // enable Dest VGPR indexing, index = s2
   65 label_0005:
   66   v_mov_b32     v0, 0          // with dest indexing on, these eight moves zero v[idx+0] .. v[idx+7]
   67   v_mov_b32     v1, 0
   68   v_mov_b32     v2, 0
   69   v_mov_b32     v3, 0
   70   v_mov_b32     v4, 0
   71   v_mov_b32     v5, 0
   72   v_mov_b32     v6, 0
   73   v_mov_b32     v7, 0
   74   s_sub_u32     s2, s2, 8      // next group of 8; SCC = 1 on unsigned borrow (after the index 0 pass)
   75   s_set_gpr_idx_idx  s2        // load the new VGPR index
   76   s_cbranch_scc0  label_0005   // loop until the subtract borrows
   77   s_set_gpr_idx_off            // disable VGPR indexing
   78
   79   //
   80   // SELECT FIRST WAVE OF THE WORKGROUP FOR THE LDS CLEAR
   81
   82   s_mov_b32     s2, 0x80000000                      // Bit31 is first_wave
   83   s_and_b32     s2, s2, s1                          // SCC = (s1 bit31 != 0). NOTE(review): the old comment said sgpr0 holds the tg_size (first_wave) term (COMPUTE_PGM_RSRC2.tg_size_en), but the code tests s1 - confirm
   84   s_cbranch_scc0  label_clean_sgpr_1                // SCC == 0: not the first wave of the ThreadGroup/WorkGroup, skip the LDS clear
   85   // CLEAR LDS
   86   //
   87   s_mov_b32 exec_lo, 0xffffffff               // enable all 64 lanes
   88   s_mov_b32 exec_hi, 0xffffffff
   89   v_mbcnt_lo_u32_b32  v1, exec_hi, 0          // Set V1 to thread-ID (0..63)
   90   v_mbcnt_hi_u32_b32  v1, exec_lo, v1         // Set V1 to thread-ID (0..63)
   91   v_mul_u32_u24  v1, 0x00000008, v1           // * 8, so each thread is a double-dword address (8byte)
   92   s_mov_b32     s2, 0x00000003f               // 64 loop iterations (63 down to 0, exit on borrow)
   93   s_mov_b32     m0, 0xffffffff                // NOTE(review): presumably lifts any M0-based LDS bounds clamp - confirm against the ISA guide
   94   // Clear all of LDS space
   95   // Each FirstWave of WorkGroup clears 64kbyte block
   96
   97 label_001F:
   98   ds_write2_b64  v1, v[2:3], v[2:3] offset1:32                 // v[2:5] are zero after the VGPR clear; stores at v1 + 0 and v1 + 256 bytes (offsets are in 8-byte units)
   99   ds_write2_b64  v1, v[4:5], v[4:5] offset0:64 offset1:96      // stores at v1 + 512 and v1 + 768 bytes
  100   v_add_co_u32     v1, vcc, 0x00000400, v1                     // advance every lane's address by 1 KB
  101   s_sub_u32     s2, s2, 1                                      // SCC = 1 on unsigned borrow (after the 64th pass)
  102   s_cbranch_scc0  label_001F                                   // 64 passes x 1 KB = 64 KB
  103   //
  104   // CLEAR SGPRs
  105   //
  106 label_clean_sgpr_1:
  107   s_mov_b32     m0, 0x0000005c   // M0 = 92, stepping down by 4: 96/4=24 iterations  (loop unrolled for performance)
  108   s_nop 0                        // NOTE(review): presumably a wait state between the M0 write and s_movreld - confirm
  109 label_sgpr_loop:
  110   s_movreld_b32     s0, 0        // M0-relative destination: zero s[M0+0] .. s[M0+3]
  111   s_movreld_b32     s1, 0
  112   s_movreld_b32     s2, 0
  113   s_movreld_b32     s3, 0
  114   s_sub_u32         m0, m0, 4    // SCC = 1 on unsigned borrow (after the M0 = 0 pass)
  115   s_cbranch_scc0  label_sgpr_loop
  116
  117   //clear flat scratch, vcc and trap temporaries (ttmp0-15)
  118   s_mov_b32 flat_scratch_lo, 0   //clear  flat scratch lo SGPR
  119   s_mov_b32 flat_scratch_hi, 0   //clear  flat scratch hi SGPR
  120   s_mov_b64 vcc, 0               //clear vcc
  121   s_mov_b64 ttmp0, 0             //Clear ttmp0 and ttmp1
  122   s_mov_b64 ttmp2, 0             //Clear ttmp2 and ttmp3
  123   s_mov_b64 ttmp4, 0             //Clear ttmp4 and ttmp5
  124   s_mov_b64 ttmp6, 0             //Clear ttmp6 and ttmp7
  125   s_mov_b64 ttmp8, 0             //Clear ttmp8 and ttmp9
  126   s_mov_b64 ttmp10, 0            //Clear ttmp10 and ttmp11
  127   s_mov_b64 ttmp12, 0            //Clear ttmp12 and ttmp13
  128   s_mov_b64 ttmp14, 0            //Clear ttmp14 and ttmp15
  129 s_endpgm
  130
  131 label_0023:                      // SGPR-only path, taken when s0 != 1
  132
  133   s_sethalt 1                    // Halt here; barriers cannot be used for >16 waves, so CP unhalts once all waves are launched
  134
  135   s_mov_b32     m0, 0x0000005c   // M0 = 92, stepping down by 4: 96/4=24 iterations  (loop unrolled for performance)
  136   s_nop 0                        // NOTE(review): presumably a wait state between the M0 write and s_movreld - confirm
  137 label_sgpr_loop1:
  138
  139   s_movreld_b32     s0, 0        // M0-relative destination: zero s[M0+0] .. s[M0+3]
  140   s_movreld_b32     s1, 0
  141   s_movreld_b32     s2, 0
  142   s_movreld_b32     s3, 0
  143   s_sub_u32         m0, m0, 4    // SCC = 1 on unsigned borrow (after the M0 = 0 pass)
  144   s_cbranch_scc0  label_sgpr_loop1
  145
  146   //overwrite flat scratch and vcc. NOTE(review): unlike the first path, ttmp0-15 are not written here - confirm this is intended
  147   s_mov_b32 flat_scratch_lo, 0   //clear  flat scratch lo SGPR
  148   s_mov_b32 flat_scratch_hi, 0   //clear  flat scratch hi SGPR
  149   s_mov_b64 vcc, 0xee            //NOTE(review): writes 0xee to vcc, not 0 as the first path does; the old comment said "clear vcc" - confirm the value is intended
  150
  151 s_endpgm
  152 end
 153