drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
/*
 * Copyright 2014-2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_9_0_offset.h"
#include "gc/gc_9_0_sh_mask.h"
#include "vega10_enum.h"
#include "sdma0/sdma0_4_0_offset.h"
#include "sdma0/sdma0_4_0_sh_mask.h"
#include "sdma1/sdma1_4_0_offset.h"
#include "sdma1/sdma1_4_0_sh_mask.h"
#include "athub/athub_1_0_offset.h"
#include "athub/athub_1_0_sh_mask.h"
#include "oss/osssys_4_0_offset.h"
#include "oss/osssys_4_0_sh_mask.h"
#include "soc15_common.h"
#include "v9_structs.h"
#include "soc15.h"
#include "soc15d.h"
#include "gfx_v9_0.h"

enum hqd_dequeue_request_type {
        NO_ACTION = 0,
        DRAIN_PIPE,
        RESET_WAVES
};

static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
{
        return (struct amdgpu_device *)kgd;
}

static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
                        uint32_t queue, uint32_t vmid)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);

        mutex_lock(&adev->srbm_mutex);
        soc15_grbm_select(adev, mec, pipe, queue, vmid);
}

static void unlock_srbm(struct kgd_dev *kgd)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);

        soc15_grbm_select(adev, 0, 0, 0, 0);
        mutex_unlock(&adev->srbm_mutex);
}

static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
                                uint32_t queue_id)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);

        uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
        uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

        lock_srbm(kgd, mec, pipe, queue_id, 0);
}

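/*
 * Compute the bit position of this queue in the per-pipe queue map; the
 * result is used as the queue mask written to CP_PQ_WPTR_POLL_CNTL1 when
 * WPTR polling is enabled for the queue.
 */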
static uint64_t get_queue_mask(struct amdgpu_device *adev,
                               uint32_t pipe_id, uint32_t queue_id)
{
        unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
                        queue_id;

        return 1ull << bit;
}

static void release_queue(struct kgd_dev *kgd)
{
        unlock_srbm(kgd);
}

void kgd_gfx_v9_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
                                        uint32_t sh_mem_config,
                                        uint32_t sh_mem_ape1_base,
                                        uint32_t sh_mem_ape1_limit,
                                        uint32_t sh_mem_bases)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);

        lock_srbm(kgd, 0, 0, 0, vmid);

        WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
        WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
        /* APE1 no longer exists on GFX9 */

        unlock_srbm(kgd);
}

int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid,
                                        unsigned int vmid)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);

        /*
         * We have to assume that there is no outstanding mapping.
         * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
         * a mapping is in progress or because a mapping finished
         * and the SW cleared it.
         * So the protocol is to always wait & clear.
         */
        uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
                        ATC_VMID0_PASID_MAPPING__VALID_MASK;

        /*
         * This needs to be done twice, once for GFX and once for MMHUB.
         * For the ATC, MMHUB uses VMID + 16; the IH uses a separate set of
         * registers. ATC_VMID0..15 registers are separate from
         * ATC_VMID16..31.
         */

        WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
               pasid_mapping);

        while (!(RREG32(SOC15_REG_OFFSET(
                                ATHUB, 0,
                                mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
                 (1U << vmid)))
                cpu_relax();

        WREG32(SOC15_REG_OFFSET(ATHUB, 0,
                                mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
               1U << vmid);

        /* Mapping vmid to pasid also for IH block */
        WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
               pasid_mapping);

        WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
               pasid_mapping);

        while (!(RREG32(SOC15_REG_OFFSET(
                                ATHUB, 0,
                                mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
                 (1U << (vmid + 16))))
                cpu_relax();

        WREG32(SOC15_REG_OFFSET(ATHUB, 0,
                                mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
               1U << (vmid + 16));

        /* Mapping vmid to pasid also for IH block */
        WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
               pasid_mapping);
        return 0;
}

/* TODO - RING0 form of field is obsolete, seems to date back to SI
 * but still works
 */

int kgd_gfx_v9_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        uint32_t mec;
        uint32_t pipe;

        mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
        pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

        lock_srbm(kgd, mec, pipe, 0, 0);

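        /* Enable time-stamp and opcode-error interrupts for this compute pipe. */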
        WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
                CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
                CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

        unlock_srbm(kgd);

        return 0;
}

static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
                                unsigned int engine_id,
                                unsigned int queue_id)
{
        uint32_t sdma_engine_reg_base = 0;
        uint32_t sdma_rlc_reg_offset;

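        /*
         * Each SDMA engine has its own register aperture; queue registers sit
         * at a fixed per-queue stride from the engine's RLC0 block.
         */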
        switch (engine_id) {
        default:
                dev_warn(adev->dev,
                         "Invalid sdma engine id (%d), using engine id 0\n",
                         engine_id);
                fallthrough;
        case 0:
                sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
                                mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
                break;
        case 1:
                sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0,
                                mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
                break;
        }

        sdma_rlc_reg_offset = sdma_engine_reg_base
                + queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);

        pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
                 queue_id, sdma_rlc_reg_offset);

        return sdma_rlc_reg_offset;
}

static inline struct v9_mqd *get_mqd(void *mqd)
{
        return (struct v9_mqd *)mqd;
}

static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
{
        return (struct v9_sdma_mqd *)mqd;
}

int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
                        uint32_t queue_id, uint32_t __user *wptr,
                        uint32_t wptr_shift, uint32_t wptr_mask,
                        struct mm_struct *mm)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        struct v9_mqd *m;
        uint32_t *mqd_hqd;
        uint32_t reg, hqd_base, data;

        m = get_mqd(mqd);

        acquire_queue(kgd, pipe_id, queue_id);

        /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
        mqd_hqd = &m->cp_mqd_base_addr_lo;
        hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);

        for (reg = hqd_base;
             reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
                WREG32_RLC(reg, mqd_hqd[reg - hqd_base]);

        /* Activate doorbell logic before triggering WPTR poll. */
        data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
                             CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
        WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);

        if (wptr) {
                /* Don't read wptr with get_user because the user
                 * context may not be accessible (if this function
                 * runs in a work queue). Instead trigger a one-shot
                 * polling read from memory in the CP. This assumes
                 * that wptr is GPU-accessible in the queue's VMID via
                 * ATC or SVM. WPTR==RPTR before starting the poll so
                 * the CP starts fetching new commands from the right
                 * place.
                 *
                 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
                 * tricky. Assume that the queue didn't overflow. The
                 * number of valid bits in the 32-bit RPTR depends on
                 * the queue size. The remaining bits are taken from
                 * the saved 64-bit WPTR. If the WPTR wrapped, add the
                 * queue size.
                 */
                uint32_t queue_size =
                        2 << REG_GET_FIELD(m->cp_hqd_pq_control,
                                           CP_HQD_PQ_CONTROL, QUEUE_SIZE);
                uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

                if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
                        guessed_wptr += queue_size;
                guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
                guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

                WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
                       lower_32_bits(guessed_wptr));
                WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
                       upper_32_bits(guessed_wptr));
                WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
                       lower_32_bits((uintptr_t)wptr));
                WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
                       upper_32_bits((uintptr_t)wptr));
                WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
                       (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
        }

        /* Start the EOP fetcher */
        WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
               REG_SET_FIELD(m->cp_hqd_eop_rptr,
                             CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

        data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
        WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);

        release_queue(kgd);

        return 0;
}

int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
                            uint32_t pipe_id, uint32_t queue_id,
                            uint32_t doorbell_off)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
        struct v9_mqd *m;
        uint32_t mec, pipe;
        int r;

        m = get_mqd(mqd);

        acquire_queue(kgd, pipe_id, queue_id);

        mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
        pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

        pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
                 mec, pipe, queue_id);

        spin_lock(&adev->gfx.kiq.ring_lock);
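        /* MAP_QUEUES is a 7-dword packet: the PM4 header plus six payload dwords. */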
        r = amdgpu_ring_alloc(kiq_ring, 7);
        if (r) {
                pr_err("Failed to alloc KIQ (%d).\n", r);
                goto out_unlock;
        }

        amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
        amdgpu_ring_write(kiq_ring,
                          PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
                          PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
                          PACKET3_MAP_QUEUES_QUEUE(queue_id) |
                          PACKET3_MAP_QUEUES_PIPE(pipe) |
                          PACKET3_MAP_QUEUES_ME((mec - 1)) |
                          PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
                          PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
                          PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
                          PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
        amdgpu_ring_write(kiq_ring,
                          PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
        amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
        amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
        amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
        amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
        amdgpu_ring_commit(kiq_ring);

out_unlock:
        spin_unlock(&adev->gfx.kiq.ring_lock);
        release_queue(kgd);

        return r;
}

int kgd_gfx_v9_hqd_dump(struct kgd_dev *kgd,
                        uint32_t pipe_id, uint32_t queue_id,
                        uint32_t (**dump)[2], uint32_t *n_regs)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {                             \
                if (WARN_ON_ONCE(i >= HQD_N_REGS))      \
                        break;                          \
                (*dump)[i][0] = (addr) << 2;            \
                (*dump)[i++][1] = RREG32(addr);         \
        } while (0)

        *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
        if (*dump == NULL)
                return -ENOMEM;

        acquire_queue(kgd, pipe_id, queue_id);

        for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
             reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
                DUMP_REG(reg);

        release_queue(kgd);

        WARN_ON_ONCE(i != HQD_N_REGS);
        *n_regs = i;

        return 0;
}

static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
                             uint32_t __user *wptr, struct mm_struct *mm)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        struct v9_sdma_mqd *m;
        uint32_t sdma_rlc_reg_offset;
        unsigned long end_jiffies;
        uint32_t data;
        uint64_t data64;
        uint64_t __user *wptr64 = (uint64_t __user *)wptr;

        m = get_sdma_mqd(mqd);
        sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
                                            m->sdma_queue_id);

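        /*
         * Disable the ring buffer and wait for the SDMA queue context to go
         * idle before reprogramming it.
         */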
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
                m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

        end_jiffies = msecs_to_jiffies(2000) + jiffies;
        while (true) {
                data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
                if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
                        break;
                if (time_after(jiffies, end_jiffies)) {
                        pr_err("SDMA RLC not idle in %s\n", __func__);
                        return -ETIME;
                }
                usleep_range(500, 1000);
        }

        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
               m->sdmax_rlcx_doorbell_offset);

        data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
                             ENABLE, 1);
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
                                m->sdmax_rlcx_rb_rptr);
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
                                m->sdmax_rlcx_rb_rptr_hi);

        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
        if (read_user_wptr(mm, wptr64, data64)) {
                WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
                       lower_32_bits(data64));
                WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
                       upper_32_bits(data64));
        } else {
                WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
                       m->sdmax_rlcx_rb_rptr);
                WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
                       m->sdmax_rlcx_rb_rptr_hi);
        }
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
                        m->sdmax_rlcx_rb_base_hi);
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
                        m->sdmax_rlcx_rb_rptr_addr_lo);
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
                        m->sdmax_rlcx_rb_rptr_addr_hi);

        data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
                             RB_ENABLE, 1);
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);

        return 0;
}

static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
                             uint32_t engine_id, uint32_t queue_id,
                             uint32_t (**dump)[2], uint32_t *n_regs)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
                        engine_id, queue_id);
        uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)

        *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
        if (*dump == NULL)
                return -ENOMEM;

        for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
                DUMP_REG(sdma_rlc_reg_offset + reg);
        for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
                DUMP_REG(sdma_rlc_reg_offset + reg);
        for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
             reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
                DUMP_REG(sdma_rlc_reg_offset + reg);
        for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
             reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
                DUMP_REG(sdma_rlc_reg_offset + reg);

        WARN_ON_ONCE(i != HQD_N_REGS);
        *n_regs = i;

        return 0;
}

bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
                                uint32_t pipe_id, uint32_t queue_id)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        uint32_t act;
        bool retval = false;
        uint32_t low, high;

        acquire_queue(kgd, pipe_id, queue_id);
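        /*
         * The queue occupies this HQD slot only if the HQD is active and its
         * PQ base matches the given queue address.
         */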
        act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
        if (act) {
                low = lower_32_bits(queue_address >> 8);
                high = upper_32_bits(queue_address >> 8);

                if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
                   high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
                        retval = true;
        }
        release_queue(kgd);
        return retval;
}

static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        struct v9_sdma_mqd *m;
        uint32_t sdma_rlc_reg_offset;
        uint32_t sdma_rlc_rb_cntl;

        m = get_sdma_mqd(mqd);
        sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
                                            m->sdma_queue_id);

        sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);

        if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
                return true;

        return false;
}

int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
                                enum kfd_preempt_type reset_type,
                                unsigned int utimeout, uint32_t pipe_id,
                                uint32_t queue_id)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        enum hqd_dequeue_request_type type;
        unsigned long end_jiffies;
        uint32_t temp;
        struct v9_mqd *m = get_mqd(mqd);

        if (amdgpu_in_reset(adev))
                return -EIO;

        acquire_queue(kgd, pipe_id, queue_id);

        if (m->cp_hqd_vmid == 0)
                WREG32_FIELD15_RLC(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

        switch (reset_type) {
        case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
                type = DRAIN_PIPE;
                break;
        case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
                type = RESET_WAVES;
                break;
        default:
                type = DRAIN_PIPE;
                break;
        }

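        /*
         * Issue the dequeue request, then poll CP_HQD_ACTIVE until the queue
         * deactivates or the timeout expires.
         */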
        WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);

        end_jiffies = (utimeout * HZ / 1000) + jiffies;
        while (true) {
                temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
                if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
                        break;
                if (time_after(jiffies, end_jiffies)) {
                        pr_err("cp queue preemption time out.\n");
                        release_queue(kgd);
                        return -ETIME;
                }
                usleep_range(500, 1000);
        }

        release_queue(kgd);
        return 0;
}

static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
                                unsigned int utimeout)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        struct v9_sdma_mqd *m;
        uint32_t sdma_rlc_reg_offset;
        uint32_t temp;
        unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

        m = get_sdma_mqd(mqd);
        sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
                                            m->sdma_queue_id);

        temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
        temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);

        while (true) {
                temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
                if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
                        break;
                if (time_after(jiffies, end_jiffies)) {
                        pr_err("SDMA RLC not idle in %s\n", __func__);
                        return -ETIME;
                }
                usleep_range(500, 1000);
        }

        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
        WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
                RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
                SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

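        /* Save the read pointers back into the MQD so the queue state can
         * be restored later.
         */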
        m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
        m->sdmax_rlcx_rb_rptr_hi =
                RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);

        return 0;
}

bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct kgd_dev *kgd,
                                        uint8_t vmid, uint16_t *p_pasid)
{
        uint32_t value;
        struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

        value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
                     + vmid);
        *p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;

        return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
}

int kgd_gfx_v9_address_watch_disable(struct kgd_dev *kgd)
{
        return 0;
}

int kgd_gfx_v9_address_watch_execute(struct kgd_dev *kgd,
                                        unsigned int watch_point_id,
                                        uint32_t cntl_val,
                                        uint32_t addr_hi,
                                        uint32_t addr_lo)
{
        return 0;
}

int kgd_gfx_v9_wave_control_execute(struct kgd_dev *kgd,
                                        uint32_t gfx_index_val,
                                        uint32_t sq_cmd)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);
        uint32_t data = 0;

        mutex_lock(&adev->grbm_idx_mutex);

        WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, gfx_index_val);
        WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);

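        /*
         * Restore GRBM_GFX_INDEX to broadcast mode so that subsequent register
         * accesses are not limited to the instance selected above.
         */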
        data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
                INSTANCE_BROADCAST_WRITES, 1);
        data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
                SH_BROADCAST_WRITES, 1);
        data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
                SE_BROADCAST_WRITES, 1);

        WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, data);
        mutex_unlock(&adev->grbm_idx_mutex);

        return 0;
}

uint32_t kgd_gfx_v9_address_watch_get_offset(struct kgd_dev *kgd,
                                        unsigned int watch_point_id,
                                        unsigned int reg_offset)
{
        return 0;
}

void kgd_gfx_v9_set_vm_context_page_table_base(struct kgd_dev *kgd,
                        uint32_t vmid, uint64_t page_table_base)
{
        struct amdgpu_device *adev = get_amdgpu_device(kgd);

        if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
                pr_err("trying to set page table base for wrong VMID %u\n",
                       vmid);
                return;
        }

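        /*
         * Both hubs need the VMID's page table: MMHUB for SDMA and GFXHUB for
         * GC, so compute and SDMA engines see the same address space.
         */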
        adev->mmhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);

        adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
}

static void lock_spi_csq_mutexes(struct amdgpu_device *adev)
{
        mutex_lock(&adev->srbm_mutex);
        mutex_lock(&adev->grbm_idx_mutex);
}

static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
{
        mutex_unlock(&adev->grbm_idx_mutex);
        mutex_unlock(&adev->srbm_mutex);
}

/**
 * get_wave_count() - Read device registers to get the number of waves in
 * flight for a particular queue. The function also returns the VMID
 * associated with the queue.
 *
 * @adev: Handle of device whose registers are to be read
 * @queue_idx: Index of queue in the queue-map bit-field
 * @wave_cnt: Output parameter updated with number of waves in flight
 * @vmid: Output parameter updated with VMID of queue whose wave count
 * is being collected
 */
static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
                int *wave_cnt, int *vmid)
{
        int pipe_idx;
        int queue_slot;
        unsigned int reg_val;

        /*
         * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
         * parameters to read out waves in flight. Get VMID if there are
         * non-zero waves in flight.
         */
        *vmid = 0xFF;
        *wave_cnt = 0;
        pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
        queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
        soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0);
        reg_val = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
                         queue_slot);
        *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
        if (*wave_cnt != 0)
                *vmid = (RREG32_SOC15(GC, 0, mmCP_HQD_VMID) &
                         CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
}

/**
 * kgd_gfx_v9_get_cu_occupancy() - Read the relevant registers associated with
 * each shader engine and aggregate the number of waves that are in flight for
 * the process whose pasid is provided as a parameter. The process could have
 * zero or more queues running and submitting waves to compute units.
 *
 * @kgd: Handle of device from which to get number of waves in flight
 * @pasid: Identifies the process for which this query call is invoked
 * @pasid_wave_cnt: Output parameter updated with number of waves in flight
 * that belong to process with given pasid
 * @max_waves_per_cu: Output parameter updated with maximum number of waves
 * possible per Compute Unit
 *
 * Note: It's possible that the device has too many queues (oversubscription),
 * in which case a VMID could be remapped to a different PASID. This could lead
 * to an inaccurate wave count. The following is a high-level sequence:
 *    Time T1: vmid = getVmid(); vmid is associated with Pasid P1
 *    Time T2: pasid = getPasid(vmid); vmid is now associated with Pasid P2
 * In the sequence above, the wave count obtained at time T1 is incorrectly
 * dropped from, or added to, the total wave count.
 *
 * The registers that provide the waves in flight are:
 *
 *  SPI_CSQ_WF_ACTIVE_STATUS - bit-map of queues per pipe. A bit is ON if a
 *  queue is slotted, OFF if there is no queue. A process could have zero or
 *  more queues slotted and submitting waves to be run on compute units. Even
 *  when a queue is slotted it is possible that there are zero wavefronts in
 *  flight; this can happen when the queue is waiting on top-of-pipe events,
 *  e.g. a waitRegMem command.
 *
 *  For each bit that is ON from above:
 *
 *    Read the (SPI_CSQ_WF_ACTIVE_COUNT_0 + queue_idx) register. It provides
 *    the number of waves that are in flight for the queue at the specified
 *    index. The index ranges from 0 to 7.
 *
 *    If non-zero waves are in flight, read the CP_HQD_VMID register to obtain
 *    the VMID of the wave(s).
 *
 *    Determine if the VMID from the above step maps to the pasid provided as
 *    a parameter. If it matches, aggregate the wave count. It is a normal
 *    condition for the VMID not to match the pasid, i.e. a device is expected
 *    to support multiple queues from multiple processes.
 *
 *  Reading the registers referenced above requires programming the GRBM
 *  appropriately.
 */
static void kgd_gfx_v9_get_cu_occupancy(struct kgd_dev *kgd, int pasid,
                int *pasid_wave_cnt, int *max_waves_per_cu)
{
        int qidx;
        int vmid;
        int se_idx;
        int sh_idx;
        int se_cnt;
        int sh_cnt;
        int wave_cnt;
        int queue_map;
        int pasid_tmp;
        int max_queue_cnt;
        int vmid_wave_cnt = 0;
        struct amdgpu_device *adev;
        DECLARE_BITMAP(cp_queue_bitmap, KGD_MAX_QUEUES);

        adev = get_amdgpu_device(kgd);
        lock_spi_csq_mutexes(adev);
        soc15_grbm_select(adev, 1, 0, 0, 0);

        /*
         * Iterate through the shader engines and arrays of the device
         * to get number of waves in flight
         */
        bitmap_complement(cp_queue_bitmap, adev->gfx.mec.queue_bitmap,
                          KGD_MAX_QUEUES);
        max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
                        adev->gfx.mec.num_queue_per_pipe;
        sh_cnt = adev->gfx.config.max_sh_per_se;
        se_cnt = adev->gfx.config.max_shader_engines;
        for (se_idx = 0; se_idx < se_cnt; se_idx++) {
                for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {

                        gfx_v9_0_select_se_sh(adev, se_idx, sh_idx, 0xffffffff);
                        queue_map = RREG32(SOC15_REG_OFFSET(GC, 0,
                                           mmSPI_CSQ_WF_ACTIVE_STATUS));

                        /*
                         * Assumption: the queue map encodes the following
                         * schema: four pipes per micro-engine, with each pipe
                         * mapping eight queues. This schema holds for GFX9
                         * devices and must be verified for newer device
                         * families.
                         */
                        for (qidx = 0; qidx < max_queue_cnt; qidx++) {

                                /* Skip queues that are not associated with
                                 * compute functions.
                                 */
                                if (!test_bit(qidx, cp_queue_bitmap))
                                        continue;

                                if (!(queue_map & (1 << qidx)))
                                        continue;

                                /* Get number of waves in flight and aggregate them */
                                get_wave_count(adev, qidx, &wave_cnt, &vmid);
                                if (wave_cnt != 0) {
                                        pasid_tmp =
                                          RREG32(SOC15_REG_OFFSET(OSSSYS, 0,
                                                 mmIH_VMID_0_LUT) + vmid);
                                        if (pasid_tmp == pasid)
                                                vmid_wave_cnt += wave_cnt;
                                }
                        }
                }
        }

        gfx_v9_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
        soc15_grbm_select(adev, 0, 0, 0, 0);
        unlock_spi_csq_mutexes(adev);

        /* Update the output parameters and return */
        *pasid_wave_cnt = vmid_wave_cnt;
        *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
                                adev->gfx.cu_info.max_waves_per_simd;
}

const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
        .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
        .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
        .init_interrupts = kgd_gfx_v9_init_interrupts,
        .hqd_load = kgd_gfx_v9_hqd_load,
        .hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
        .hqd_sdma_load = kgd_hqd_sdma_load,
        .hqd_dump = kgd_gfx_v9_hqd_dump,
        .hqd_sdma_dump = kgd_hqd_sdma_dump,
        .hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied,
        .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
        .hqd_destroy = kgd_gfx_v9_hqd_destroy,
        .hqd_sdma_destroy = kgd_hqd_sdma_destroy,
        .address_watch_disable = kgd_gfx_v9_address_watch_disable,
        .address_watch_execute = kgd_gfx_v9_address_watch_execute,
        .wave_control_execute = kgd_gfx_v9_wave_control_execute,
        .address_watch_get_offset = kgd_gfx_v9_address_watch_get_offset,
        .get_atc_vmid_pasid_mapping_info =
                        kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
        .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
        .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
};