drivers/gpu/drm/amd/amdgpu/gfx_v10_3_0_cleaner_shader.asm

   1 /* SPDX-License-Identifier: MIT */
   2 /*
   3  * Copyright 2025 Advanced Micro Devices, Inc.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in
  13  * all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  21  * OTHER DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 // This shader cleans LDS, SGPRs and VGPRs. It is the first 64 Dwords (256 bytes) of the 192-Dword cleaner shader.
  25 // To turn this shader program on for compilation, change this to main and the lower shader main to main_1
  26
  27 // GFX10.3 : Clear SGPRs, VGPRs and LDS
  28 //   Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot
  29 //   Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD
  30 //   Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS)
  31 //      It takes 2 workgroups to use all of LDS: one on each CU of the WGP
  32 //   Each wave clears SGPRs 0 - 107
  33 //   Each wave clears VGPRs 0 - 63
  34 //   The first wave of the workgroup clears its 64KB of LDS
  35 //   The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
  36 //       before any wave in the workgroup could end.  Without this, it is possible not all SGPRs get cleared.
  37
  38
  39 shader main
  40   asic(GFX10)
  41   type(CS)
  42   wave_size(32)
  43 // Note: original source code from SQ team
  44
  45 //
  46 // Create 32 waves in a threadgroup (CS waves)
  47 // Each allocates 64 VGPRs
  48 // The workgroup allocates all of LDS (64kbytes)
  49 //
  50 // Takes about 2500 clocks to run.
  51 //   (theoretical fastest = 1024clks vgpr + 640lds = ~1660 clks)
  52 //
  53   S_BARRIER  // Hold every wave here so no wave of the workgroup can end before all have launched
  54   s_mov_b32     s2, 0x00000038  // s2 = 56: VGPR base index for the next pass; loop runs 64/8=8 times (unrolled for performance)
  55   s_mov_b32     m0, 0  // m0 is the relative index used by v_movreld; the first pass covers v0..v7
  56   //
  57   // CLEAR VGPRs (8 registers per pass, m0 = 0, 56, 48, ..., 8)
  58   //
  59 label_0005:
  60   v_movreld_b32     v0, 0
  61   v_movreld_b32     v1, 0
  62   v_movreld_b32     v2, 0
  63   v_movreld_b32     v3, 0
  64   v_movreld_b32     v4, 0
  65   v_movreld_b32     v5, 0
  66   v_movreld_b32     v6, 0
  67   v_movreld_b32     v7, 0
  68   s_mov_b32         m0, s2  // Next pass targets the 8 VGPRs starting at index s2
  69   s_sub_u32     s2, s2, 8  // Step the base down by 8; SCC is set once s2 underflows past 0
  70   s_cbranch_scc0  label_0005  // Keep looping until the subtraction borrows
  71   //
  72   s_mov_b32     s2, 0x80000000                     // Bit31 is first_wave
  73   s_and_b32     s2, s2, s0                                  // s0 holds the tg_size word (with the first_wave bit) because the ucode sets only COMPUTE_PGM_RSRC2.tg_size_en
  74   s_cbranch_scc0  label_0023                         // Skip the LDS clear unless this is the first wave of the ThreadGroup/WorkGroup
  75   // CLEAR LDS (first wave only)
  76   //
  77   s_mov_b32 exec_lo, 0xffffffff  // Enable all lanes for the LDS stores
  78   s_mov_b32 exec_hi, 0xffffffff
  79   v_mbcnt_lo_u32_b32  v1, exec_hi, 0          // Set V1 to thread-ID; NOTE(review): with wave_size(32) the range is presumably 0..31 rather than 0..63 - confirm
  80   v_mbcnt_hi_u32_b32  v1, exec_lo, v1        // Second half of the thread-ID computation (adds the upper-half lane count)
  81   v_mul_u32_u24  v1, 0x00000008, v1          // * 8, so each thread owns one double-dword (8-byte) LDS address
  82   s_mov_b32     s2, 0x00000003f                    // s2 = 63: 64 loop iterations (loop exits when s_sub_u32 borrows)
  83   s_mov_b32     m0, 0xffffffff  // NOTE(review): presumably removes any m0-based LDS size limit for the DS writes - confirm
  84   // Clear all of LDS space
  85   // Each FirstWave of WorkGroup clears 64kbyte block
  86
  87 label_001F:
  88   ds_write2_b64  v1, v[2:3], v[2:3] offset1:32  // v2..v5 were zeroed by the VGPR clear above, so these stores write zeros
  89   ds_write2_b64  v1, v[4:5], v[4:5] offset0:64 offset1:96
  90   v_add_co_u32     v1, vcc, 0x00000400, v1  // Advance each thread's address by 0x400 (1024) bytes; 64 passes cover 64KB
  91   s_sub_u32     s2, s2, 1  // Count down; SCC is set once s2 underflows past 0
  92   s_cbranch_scc0  label_001F  // Keep looping until the subtraction borrows
  93
  94   //
  95   // CLEAR SGPRs (4 registers per pass, from s104..s107 down to s0..s3)
  96   //
  97 label_0023:
  98   s_mov_b32     m0, 0x00000068  // m0 = 104: start at s104..s107; loop runs 108/4=27 times (unrolled for performance)
  99 label_sgpr_loop:
 100   s_movreld_b32     s0, 0
 101   s_movreld_b32     s1, 0
 102   s_movreld_b32     s2, 0
 103   s_movreld_b32     s3, 0
 104   s_sub_u32         m0, m0, 4  // Step down by 4 SGPRs; SCC is set once m0 underflows past 0
 105   s_cbranch_scc0  label_sgpr_loop  // Exit after the pass that cleared s0..s3
 106
 107   // Clear the remaining special registers: flat_scratch, vcc and ttmp0..ttmp15
 108   s_mov_b32 flat_scratch_lo, 0   //clear  flat scratch lo SGPR
 109   s_mov_b32 flat_scratch_hi, 0   //clear  flat scratch hi SGPR
 110   s_mov_b64 vcc, 0          //clear vcc
 111   s_mov_b64 ttmp0, 0        //Clear ttmp0 and ttmp1
 112   s_mov_b64 ttmp2, 0        //Clear ttmp2 and ttmp3
 113   s_mov_b64 ttmp4, 0        //Clear ttmp4 and ttmp5
 114   s_mov_b64 ttmp6, 0        //Clear ttmp6 and ttmp7
 115   s_mov_b64 ttmp8, 0        //Clear ttmp8 and ttmp9
 116   s_mov_b64 ttmp10, 0       //Clear ttmp10 and ttmp11
 117   s_mov_b64 ttmp12, 0       //Clear ttmp12 and ttmp13
 118   s_mov_b64 ttmp14, 0       //Clear ttmp14 and ttmp15
 119
 120  s_endpgm
 121
 122 end
 123
 124