/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>
#include <uapi/linux/kfd_sysfs.h>

#define MAX_WATCH_ADDRESSES     4

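/*
 * Report the next pending debug event for @process: queue exceptions are
 * checked first, then per-device exceptions, then process-level exceptions.
 * Bits in @exception_clear_mask are cleared from the reported source.
 * Returns 0 if an event was found, -EAGAIN if none is pending and -ENODATA
 * if debug trapping is not enabled.
 */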
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
                      unsigned int *queue_id,
                      unsigned int *gpu_id,
                      uint64_t exception_clear_mask,
                      uint64_t *event_status)
{
        struct process_queue_manager *pqm;
        struct process_queue_node *pqn;
        int i;

        if (!(process && process->debug_trap_enabled))
                return -ENODATA;

        mutex_lock(&process->event_mutex);
        *event_status = 0;
        *queue_id = 0;
        *gpu_id = 0;

        /* find and report queue events */
        pqm = &process->pqm;
        list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
                uint64_t tmp = process->exception_enable_mask;

                if (!pqn->q)
                        continue;

                tmp &= pqn->q->properties.exception_status;

                if (!tmp)
                        continue;

                *event_status = pqn->q->properties.exception_status;
                *queue_id = pqn->q->properties.queue_id;
                *gpu_id = pqn->q->device->id;
                pqn->q->properties.exception_status &= ~exception_clear_mask;
                goto out;
        }

        /* find and report device events */
        for (i = 0; i < process->n_pdds; i++) {
                struct kfd_process_device *pdd = process->pdds[i];
                uint64_t tmp = process->exception_enable_mask
                                                & pdd->exception_status;

                if (!tmp)
                        continue;

                *event_status = pdd->exception_status;
                *gpu_id = pdd->dev->id;
                pdd->exception_status &= ~exception_clear_mask;
                goto out;
        }

        /* report process events */
        if (process->exception_enable_mask & process->exception_status) {
                *event_status = process->exception_status;
                process->exception_status &= ~exception_clear_mask;
        }

out:
        mutex_unlock(&process->event_mutex);
        return *event_status ? 0 : -EAGAIN;
}

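/*
 * Deferred writer for debug events: writes a single byte ('.') to the debug
 * event file descriptor.  Used by kfd_dbg_ev_raise() when the caller asks for
 * the write to happen from worker context.
 */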
void debug_event_write_work_handler(struct work_struct *work)
{
        struct kfd_process *process;

        static const char write_data = '.';
        loff_t pos = 0;

        process = container_of(work,
                        struct kfd_process,
                        debug_event_workarea);

        if (process->debug_trap_enabled && process->dbg_ev_file)
                kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status and write to the event
 * descriptor only if the raised exception is enabled in the process's
 * exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
                        struct kfd_process *process, struct kfd_node *dev,
                        unsigned int source_id, bool use_worker,
                        void *exception_data, size_t exception_data_size)
{
        struct process_queue_manager *pqm;
        struct process_queue_node *pqn;
        int i;
        static const char write_data = '.';
        loff_t pos = 0;
        bool is_subscribed = true;

        if (!(process && process->debug_trap_enabled))
                return false;

        mutex_lock(&process->event_mutex);

        if (event_mask & KFD_EC_MASK_DEVICE) {
                for (i = 0; i < process->n_pdds; i++) {
                        struct kfd_process_device *pdd = process->pdds[i];

                        if (pdd->dev != dev)
                                continue;

                        pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

                        if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
                                if (!pdd->vm_fault_exc_data) {
                                        pdd->vm_fault_exc_data = kmemdup(
                                                        exception_data,
                                                        exception_data_size,
                                                        GFP_KERNEL);
                                        if (!pdd->vm_fault_exc_data)
                                                pr_debug("Failed to allocate exception data memory\n");
                                } else {
                                        pr_debug("Debugger exception data not saved\n");
                                        print_hex_dump_bytes("exception data: ",
                                                        DUMP_PREFIX_OFFSET,
                                                        exception_data,
                                                        exception_data_size);
                                }
                        }
                        break;
                }
        } else if (event_mask & KFD_EC_MASK_PROCESS) {
                process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
        } else {
                pqm = &process->pqm;
                list_for_each_entry(pqn, &pqm->queues,
                                process_queue_list) {
                        int target_id;

                        if (!pqn->q)
                                continue;

                        target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
                                        pqn->q->properties.queue_id :
                                                        pqn->q->doorbell_id;

                        if (pqn->q->device != dev || target_id != source_id)
                                continue;

                        pqn->q->properties.exception_status |= event_mask;
                        break;
                }
        }

        if (process->exception_enable_mask & event_mask) {
                if (use_worker)
                        schedule_work(&process->debug_event_workarea);
                else
                        kernel_write(process->dbg_ev_file,
                                        &write_data,
                                        1,
                                        &pos);
        } else {
                is_subscribed = false;
        }

        mutex_unlock(&process->event_mutex);

        return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
                                   unsigned int pasid,
                                   uint32_t doorbell_id,
                                   uint64_t trap_mask,
                                   void *exception_data,
                                   size_t exception_data_size)
{
        struct kfd_process *p;
        bool signaled_to_debugger_or_runtime = false;

        p = kfd_lookup_process_by_pasid(pasid);

        if (!p)
                return false;

        if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
                              exception_data, exception_data_size)) {
                struct process_queue_manager *pqm;
                struct process_queue_node *pqn;

                if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
                       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
                        mutex_lock(&p->mutex);

                        pqm = &p->pqm;
                        list_for_each_entry(pqn, &pqm->queues,
                                                        process_queue_list) {

                                if (!(pqn->q && pqn->q->device == dev &&
                                      pqn->q->doorbell_id == doorbell_id))
                                        continue;

                                kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
                                                              trap_mask);

                                signaled_to_debugger_or_runtime = true;

                                break;
                        }

                        mutex_unlock(&p->mutex);
                } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
                        kfd_dqm_evict_pasid(dev->dqm, p->pasid);
                        kfd_signal_vm_fault_event(dev, p->pasid, NULL,
                                                        exception_data);

                        signaled_to_debugger_or_runtime = true;
                }
        } else {
                signaled_to_debugger_or_runtime = true;
        }

        kfd_unref_process(p);

        return signaled_to_debugger_or_runtime;
}

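/*
 * Forward exceptions raised by the debugger to the runtime: memory violations
 * are re-signalled as VM faults, EC_PROCESS_RUNTIME releases the blocked
 * runtime-enable path, and any remaining reason bits are passed on to
 * kfd_send_exception_to_runtime().
 */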
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
                                        unsigned int dev_id,
                                        unsigned int queue_id,
                                        uint64_t error_reason)
{
        if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
                struct kfd_process_device *pdd = NULL;
                struct kfd_hsa_memory_exception_data *data;
                int i;

                for (i = 0; i < p->n_pdds; i++) {
                        if (p->pdds[i]->dev->id == dev_id) {
                                pdd = p->pdds[i];
                                break;
                        }
                }

                if (!pdd)
                        return -ENODEV;

                data = (struct kfd_hsa_memory_exception_data *)
                                                pdd->vm_fault_exc_data;

                kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
                kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
                error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
        }

        if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
                /*
                 * The block should only happen after the debugger receives the
                 * runtime enable notice.
                 */
                up(&p->runtime_enable_sema);
                error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
        }

        if (error_reason)
                return kfd_send_exception_to_runtime(p, queue_id, error_reason);

        return 0;
}

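/*
 * Apply or remove the CWSR debug workaround on a single queue by updating its
 * MQD.  Returns -EBUSY if enabling would conflict with a user-provided CU
 * mask on that queue.
 */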
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
        struct mqd_update_info minfo = {0};
        int err;

        if (!q)
                return 0;

        if (!kfd_dbg_has_cwsr_workaround(q->device))
                return 0;

        if (enable && q->properties.is_user_cu_masked)
                return -EBUSY;

        minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

        q->properties.is_dbg_wa = enable;
        err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
        if (err)
                q->properties.is_dbg_wa = false;

        return err;
}

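/*
 * Apply or remove the CWSR debug workaround on every queue of the target
 * process, unwinding all queues if enabling fails part way through.
 */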
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
        struct process_queue_manager *pqm = &target->pqm;
        struct process_queue_node *pqn;
        int r = 0;

        list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
                r = kfd_dbg_set_queue_workaround(pqn->q, enable);
                if (enable && r)
                        goto unwind;
        }

        return 0;

unwind:
        list_for_each_entry(pqn, &pqm->queues, process_queue_list)
                kfd_dbg_set_queue_workaround(pqn->q, false);

        if (enable)
                target->runtime_info.runtime_state = r == -EBUSY ?
                                DEBUG_RUNTIME_STATE_ENABLED_BUSY :
                                DEBUG_RUNTIME_STATE_ENABLED_ERROR;

        return r;
}

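/*
 * Push the per-process debug configuration (SPI debug override/launch mode,
 * watch points and debug flags) to the MES scheduler on devices with
 * per-VMID debug support.
 */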
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
        uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
        uint32_t flags = pdd->process->dbg_flags;

        if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
                return 0;

        return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
                                                pdd->watch_points, flags, sq_trap_en);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
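/*
 * Allocate a free device watch point for this process.  Watch point usage is
 * tracked as a bit mask on both the process device and the device itself.
 * Returns 0 and stores the allocated ID in *watch_id, or -ENOMEM if all
 * MAX_WATCH_ADDRESSES watch points are in use.
 */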
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
        int i;

        *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

        spin_lock(&pdd->dev->watch_points_lock);

        for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
                /* device watchpoint in use so skip */
                if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
                        continue;

                pdd->alloc_watch_ids |= 0x1 << i;
                pdd->dev->alloc_watch_ids |= 0x1 << i;
                *watch_id = i;
                spin_unlock(&pdd->dev->watch_points_lock);
                return 0;
        }

        spin_unlock(&pdd->dev->watch_points_lock);

        return -ENOMEM;
}

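/* Release a device watch point previously allocated by this process. */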
static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
        spin_lock(&pdd->dev->watch_points_lock);

        /* process owns device watch point so safe to clear */
        if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
                pdd->alloc_watch_ids &= ~(0x1 << watch_id);
                pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
        }

        spin_unlock(&pdd->dev->watch_points_lock);
}

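/* Check whether this process currently owns the given device watch point. */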
static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
        bool owns_watch_id = false;

        spin_lock(&pdd->dev->watch_points_lock);
        owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
                        ((pdd->alloc_watch_ids >> watch_id) & 0x1);

        spin_unlock(&pdd->dev->watch_points_lock);

        return owns_watch_id;
}

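/*
 * Clear an address watch point on the device and release its ID.  Queues are
 * unmapped around the register update when MES is not in use.
 */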
int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
                                        uint32_t watch_id)
{
        int r;

        if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
                return -EINVAL;

        if (!pdd->dev->kfd->shared_resources.enable_mes) {
                r = debug_lock_and_unmap(pdd->dev->dqm);
                if (r)
                        return r;
        }

        amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
        pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
                                                        pdd->dev->adev,
                                                        watch_id);
        amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

        if (!pdd->dev->kfd->shared_resources.enable_mes)
                r = debug_map_and_unlock(pdd->dev->dqm);
        else
                r = kfd_dbg_set_mes_debug_mode(pdd, true);

        kfd_dbg_clear_dev_watch_id(pdd, watch_id);

        return r;
}

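/*
 * Program an address watch point on every XCC instance of the device.  On
 * failure the watch point ID is released again; see the comment below on why
 * no HW rollback is attempted.
 */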
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
                                        uint64_t watch_address,
                                        uint32_t watch_address_mask,
                                        uint32_t *watch_id,
                                        uint32_t watch_mode)
{
        int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
        uint32_t xcc_mask = pdd->dev->xcc_mask;

        if (r)
                return r;

        if (!pdd->dev->kfd->shared_resources.enable_mes) {
                r = debug_lock_and_unmap(pdd->dev->dqm);
                if (r) {
                        kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
                        return r;
                }
        }

        amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
        for_each_inst(xcc_id, xcc_mask)
                pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
                                pdd->dev->adev,
                                watch_address,
                                watch_address_mask,
                                *watch_id,
                                watch_mode,
                                pdd->dev->vm_info.last_vmid_kfd,
                                xcc_id);
        amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

        if (!pdd->dev->kfd->shared_resources.enable_mes)
                r = debug_map_and_unlock(pdd->dev->dqm);
        else
                r = kfd_dbg_set_mes_debug_mode(pdd, true);

        /* HWS is broken, so there is no point in a HW rollback, but release the watch point anyway */
        if (r)
                kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

        return 0;
}

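/* Clear all address watch points on every device of the target process. */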
static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
        int i, j;

        for (i = 0; i < target->n_pdds; i++)
                for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
                        kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

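/*
 * Update the process debug flags after validating that every device supports
 * the requested precise memory/ALU operation modes.  *flags returns the
 * previous flags; on failure the previous flags are restored and devices that
 * were already updated are rewound.
 */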
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
        uint32_t prev_flags = target->dbg_flags;
        int i, r = 0, rewind_count = 0;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_topology_device *topo_dev =
                                kfd_topology_device_by_id(target->pdds[i]->dev->id);
                uint32_t caps = topo_dev->node_props.capability;

                if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
                        (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
                        *flags = prev_flags;
                        return -EACCES;
                }

                if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
                    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
                        *flags = prev_flags;
                        return -EACCES;
                }
        }

        target->dbg_flags = *flags;
        *flags = prev_flags;
        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
                        continue;

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        r = debug_refresh_runlist(pdd->dev->dqm);
                else
                        r = kfd_dbg_set_mes_debug_mode(pdd, true);

                if (r) {
                        target->dbg_flags = prev_flags;
                        break;
                }

                rewind_count++;
        }

        /* Rewind flags */
        if (r) {
                target->dbg_flags = prev_flags;

                for (i = 0; i < rewind_count; i++) {
                        struct kfd_process_device *pdd = target->pdds[i];

                        if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
                                continue;

                        if (!pdd->dev->kfd->shared_resources.enable_mes)
                                debug_refresh_runlist(pdd->dev->dqm);
                        else
                                kfd_dbg_set_mes_debug_mode(pdd, true);
                }
        }

        return r;
}

/* kfd_dbg_trap_deactivate:
 *      target: target process
 *      unwind: true if this is unwinding a failed kfd_dbg_trap_enable()
 *      unwind_count: if unwind == true, how far down the pdd list we need
 *              to unwind; otherwise ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
        int i;

        if (!unwind) {
                uint32_t flags = 0;
                int resume_count = resume_queues(target, 0, NULL);

                if (resume_count)
                        pr_debug("Resumed %d queues\n", resume_count);

                cancel_work_sync(&target->debug_event_workarea);
                kfd_dbg_clear_process_address_watch(target);
                kfd_dbg_trap_set_wave_launch_mode(target, 0);

                kfd_dbg_trap_set_flags(target, &flags);
        }

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                /* If this is an unwind, and we have unwound the required
                 * enable calls on the pdd list, we need to stop now,
                 * otherwise we may mess up another debugger session.
                 */
                if (unwind && i == unwind_count)
                        break;

                kfd_process_set_trap_debug_flag(&pdd->qpd, false);

                /* GFX OFF is already disabled by debug activate if RLC restore is not supported. */
                if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
                        amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
                pdd->spi_dbg_override =
                                pdd->dev->kfd2kgd->disable_debug_trap(
                                pdd->dev->adev,
                                target->runtime_info.ttmp_setup,
                                pdd->dev->vm_info.last_vmid_kfd);
                amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

                if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
                                release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
                        pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        debug_refresh_runlist(pdd->dev->dqm);
                else
                        kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
        }

        kfd_dbg_set_workaround(target, false);
}

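/*
 * Drain pending interrupts and clear all device, queue and process exception
 * status for the target process.
 */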
static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
        struct process_queue_manager *pqm;
        struct process_queue_node *pqn;
        int i;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                kfd_process_drain_interrupts(pdd);

                pdd->exception_status = 0;
        }

        pqm = &target->pqm;
        list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
                if (!pqn->q)
                        continue;

                pqn->q->properties.exception_status = 0;
        }

        target->exception_status = 0;
}

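/*
 * Disable debug trapping for the target process: deactivate (or defer
 * deactivation to runtime enable), release the event file descriptor and the
 * extra process reference taken for the debug session.
 */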
int kfd_dbg_trap_disable(struct kfd_process *target)
{
        if (!target->debug_trap_enabled)
                return 0;

        /*
         * Defer deactivation to runtime if runtime is not enabled; otherwise
         * reset the attached running target's runtime state to enabled for
         * re-attach.
         */
        if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
                kfd_dbg_trap_deactivate(target, false, 0);
        else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
                target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

        cancel_work_sync(&target->debug_event_workarea);
        fput(target->dbg_ev_file);
        target->dbg_ev_file = NULL;

        if (target->debugger_process) {
                atomic_dec(&target->debugger_process->debugged_process_count);
                target->debugger_process = NULL;
        }

        target->debug_trap_enabled = false;
        kfd_dbg_clean_exception_status(target);
        kfd_unref_process(target);

        return 0;
}

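/*
 * Activate debug trapping on every device of the target process: reserve a
 * debug VMID where per-VMID debugging is not supported, enable the trap
 * handler and refresh the runlist or MES debug state.  Failure unwinds the
 * devices enabled so far.
 */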
int kfd_dbg_trap_activate(struct kfd_process *target)
{
        int i, r = 0;

        r = kfd_dbg_set_workaround(target, true);
        if (r)
                return r;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
                        r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

                        if (r) {
                                target->runtime_info.runtime_state = (r == -EBUSY) ?
                                                        DEBUG_RUNTIME_STATE_ENABLED_BUSY :
                                                        DEBUG_RUNTIME_STATE_ENABLED_ERROR;

                                goto unwind_err;
                        }
                }

                /* Disable GFX OFF to prevent garbage reads/writes to debug registers.
                 * If RLC restore of debug registers is not supported and runtime enable
                 * hasn't done so already on ttmp setup request, restore the trap config registers.
                 *
                 * If RLC restore of debug registers is not supported, keep GFX OFF disabled for
                 * the debug session.
                 */
                amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
                if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
                                                target->runtime_info.ttmp_setup))
                        pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
                                                                pdd->dev->vm_info.last_vmid_kfd);

                pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
                                        pdd->dev->adev,
                                        false,
                                        pdd->dev->vm_info.last_vmid_kfd);

                if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
                        amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

                /*
                 * Setting the debug flag in the trap handler requires that the TMA has been
                 * allocated, which occurs during CWSR initialization.
                 * In the event that CWSR has not been initialized at this point, setting the
                 * flag will be called again during CWSR initialization if the target process
                 * is still debug enabled.
                 */
                kfd_process_set_trap_debug_flag(&pdd->qpd, true);

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        r = debug_refresh_runlist(pdd->dev->dqm);
                else
                        r = kfd_dbg_set_mes_debug_mode(pdd, true);

                if (r) {
                        target->runtime_info.runtime_state =
                                        DEBUG_RUNTIME_STATE_ENABLED_ERROR;
                        goto unwind_err;
                }
        }

        return 0;

unwind_err:
        /* Enabling debug failed; we need to disable on
         * all GPUs so that the enable is all or nothing.
         */
        kfd_dbg_trap_deactivate(target, true, i);
        return r;
}

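/*
 * Enable debugging for the target process and copy the runtime info back to
 * the debugger.  Activation is performed immediately only if the runtime is
 * already enabled; otherwise it is deferred to runtime enable.
 */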
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
                        void __user *runtime_info, uint32_t *runtime_size)
{
        struct file *f;
        uint32_t copy_size;
        int i, r = 0;

        if (target->debug_trap_enabled)
                return -EALREADY;

        /* Enable pre-checks */
        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                if (!KFD_IS_SOC15(pdd->dev))
                        return -ENODEV;

                if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
                                         kfd_dbg_has_cwsr_workaround(pdd->dev)))
                        return -EBUSY;
        }

        copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

        f = fget(fd);
        if (!f) {
                pr_err("Failed to get file for (%i)\n", fd);
                return -EBADF;
        }

        target->dbg_ev_file = f;

        /* defer activation to runtime if not runtime enabled */
        if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
                kfd_dbg_trap_activate(target);

        /* We already hold the process reference but hold another one for the
         * debug session.
         */
        kref_get(&target->ref);
        target->debug_trap_enabled = true;

        if (target->debugger_process)
                atomic_inc(&target->debugger_process->debugged_process_count);

        if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
                kfd_dbg_trap_deactivate(target, false, 0);
                r = -EFAULT;
        }

        *runtime_size = sizeof(target->runtime_info);

        return r;
}

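/*
 * Check the requested wave launch trap override against what every device of
 * the process supports; returns -EACCES if any requested mask bit is
 * unsupported.
 */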
static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
                                                uint32_t trap_override,
                                                uint32_t trap_mask_request,
                                                uint32_t *trap_mask_supported)
{
        int i = 0;

        *trap_mask_supported = 0xffffffff;

        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
                int err = pdd->dev->kfd2kgd->validate_trap_override_request(
                                                                pdd->dev->adev,
                                                                trap_override,
                                                                trap_mask_supported);

                if (err)
                        return err;
        }

        if (trap_mask_request & ~*trap_mask_supported)
                return -EACCES;

        return 0;
}

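/*
 * Apply a wave launch trap override on every device of the target process and
 * refresh the runlist or MES debug state.
 */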
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
                                        uint32_t trap_override,
                                        uint32_t trap_mask_bits,
                                        uint32_t trap_mask_request,
                                        uint32_t *trap_mask_prev,
                                        uint32_t *trap_mask_supported)
{
        int r = 0, i;

        r = kfd_dbg_validate_trap_override_request(target,
                                                trap_override,
                                                trap_mask_request,
                                                trap_mask_supported);

        if (r)
                return r;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
                pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
                                pdd->dev->adev,
                                pdd->dev->vm_info.last_vmid_kfd,
                                trap_override,
                                trap_mask_bits,
                                trap_mask_request,
                                trap_mask_prev,
                                pdd->spi_dbg_override);
                amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        r = debug_refresh_runlist(pdd->dev->dqm);
                else
                        r = kfd_dbg_set_mes_debug_mode(pdd, true);

                if (r)
                        break;
        }

        return r;
}

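/*
 * Set the wave launch mode (normal, halt or debug) on every device of the
 * target process.
 */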
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
                                        uint8_t wave_launch_mode)
{
        int r = 0, i;

        if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
                        wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
                        wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
                return -EINVAL;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
                pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
                                pdd->dev->adev,
                                wave_launch_mode,
                                pdd->dev->vm_info.last_vmid_kfd);
                amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        r = debug_refresh_runlist(pdd->dev->dqm);
                else
                        r = kfd_dbg_set_mes_debug_mode(pdd, true);

                if (r)
                        break;
        }

        return r;
}

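/*
 * Copy exception information for a single queue, device or process level
 * exception code to user space, optionally clearing it.  Additional payload
 * is returned for device memory violations and the process runtime info.
 */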
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
                uint32_t source_id,
                uint32_t exception_code,
                bool clear_exception,
                void __user *info,
                uint32_t *info_size)
{
        bool found = false;
        int r = 0;
        uint32_t copy_size, actual_info_size = 0;
        uint64_t *exception_status_ptr = NULL;

        if (!target)
                return -EINVAL;

        if (!info || !info_size)
                return -EINVAL;

        mutex_lock(&target->event_mutex);

        if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
                /* Per queue exceptions */
                struct queue *queue = NULL;
                int i;

                for (i = 0; i < target->n_pdds; i++) {
                        struct kfd_process_device *pdd = target->pdds[i];
                        struct qcm_process_device *qpd = &pdd->qpd;

                        list_for_each_entry(queue, &qpd->queues_list, list) {
                                if (!found && queue->properties.queue_id == source_id) {
                                        found = true;
                                        break;
                                }
                        }
                        if (found)
                                break;
                }

                if (!found) {
                        r = -EINVAL;
                        goto out;
                }

                if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
                        r = -ENODATA;
                        goto out;
                }
                exception_status_ptr = &queue->properties.exception_status;
        } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
                /* Per device exceptions */
                struct kfd_process_device *pdd = NULL;
                int i;

                for (i = 0; i < target->n_pdds; i++) {
                        pdd = target->pdds[i];
                        if (pdd->dev->id == source_id) {
                                found = true;
                                break;
                        }
                }

                if (!found) {
                        r = -EINVAL;
                        goto out;
                }

                if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
                        r = -ENODATA;
                        goto out;
                }

                if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
                        copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

                        if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
                                r = -EFAULT;
                                goto out;
                        }
                        actual_info_size = pdd->vm_fault_exc_data_size;
                        if (clear_exception) {
                                kfree(pdd->vm_fault_exc_data);
                                pdd->vm_fault_exc_data = NULL;
                                pdd->vm_fault_exc_data_size = 0;
                        }
                }
                exception_status_ptr = &pdd->exception_status;
        } else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
                /* Per process exceptions */
                if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
                        r = -ENODATA;
                        goto out;
                }

                if (exception_code == EC_PROCESS_RUNTIME) {
                        copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

                        if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
                                r = -EFAULT;
                                goto out;
                        }

                        actual_info_size = sizeof(target->runtime_info);
                }

                exception_status_ptr = &target->exception_status;
        } else {
                pr_debug("Bad exception type [%i]\n", exception_code);
                r = -EINVAL;
                goto out;
        }

        *info_size = actual_info_size;
        if (clear_exception)
                *exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
        mutex_unlock(&target->event_mutex);
        return r;
}

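/*
 * Copy a per-device information snapshot (apertures, topology properties and
 * exception status) for up to *number_of_device_infos devices to user space.
 */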
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
                uint64_t exception_clear_mask,
                void __user *user_info,
                uint32_t *number_of_device_infos,
                uint32_t *entry_size)
{
        struct kfd_dbg_device_info_entry device_info;
        uint32_t tmp_entry_size, tmp_num_devices;
        int i, r = 0;

        if (!(target && user_info && number_of_device_infos && entry_size))
                return -EINVAL;

        tmp_entry_size = *entry_size;

        tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
        *number_of_device_infos = target->n_pdds;
        *entry_size = min_t(size_t, *entry_size, sizeof(device_info));

        if (!tmp_num_devices)
                return 0;

        memset(&device_info, 0, sizeof(device_info));

        mutex_lock(&target->event_mutex);

        /* Run over all pdds of the process */
        for (i = 0; i < tmp_num_devices; i++) {
                struct kfd_process_device *pdd = target->pdds[i];
                struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

                device_info.gpu_id = pdd->dev->id;
                device_info.exception_status = pdd->exception_status;
                device_info.lds_base = pdd->lds_base;
                device_info.lds_limit = pdd->lds_limit;
                device_info.scratch_base = pdd->scratch_base;
                device_info.scratch_limit = pdd->scratch_limit;
                device_info.gpuvm_base = pdd->gpuvm_base;
                device_info.gpuvm_limit = pdd->gpuvm_limit;
                device_info.location_id = topo_dev->node_props.location_id;
                device_info.vendor_id = topo_dev->node_props.vendor_id;
                device_info.device_id = topo_dev->node_props.device_id;
                device_info.revision_id = pdd->dev->adev->pdev->revision;
                device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
                device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
                device_info.fw_version = pdd->dev->kfd->mec_fw_version;
                device_info.gfx_target_version =
                        topo_dev->node_props.gfx_target_version;
                device_info.simd_count = topo_dev->node_props.simd_count;
                device_info.max_waves_per_simd =
                        topo_dev->node_props.max_waves_per_simd;
                device_info.array_count = topo_dev->node_props.array_count;
                device_info.simd_arrays_per_engine =
                        topo_dev->node_props.simd_arrays_per_engine;
                device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
                device_info.capability = topo_dev->node_props.capability;
                device_info.debug_prop = topo_dev->node_props.debug_prop;

                if (exception_clear_mask)
                        pdd->exception_status &= ~exception_clear_mask;

                if (copy_to_user(user_info, &device_info, *entry_size)) {
                        r = -EFAULT;
                        break;
                }

                user_info += tmp_entry_size;
        }

        mutex_unlock(&target->event_mutex);

        return r;
}

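/*
 * Update the set of exceptions the debugger is subscribed to and write to the
 * event file descriptor if any already-raised exception falls in the new mask.
 */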
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
                                        uint64_t exception_set_mask)
{
        uint64_t found_mask = 0;
        struct process_queue_manager *pqm;
        struct process_queue_node *pqn;
        static const char write_data = '.';
        loff_t pos = 0;
        int i;

        mutex_lock(&target->event_mutex);

        found_mask |= target->exception_status;

        pqm = &target->pqm;
        list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
                if (!pqn->q)
                        continue;

                found_mask |= pqn->q->properties.exception_status;
        }

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                found_mask |= pdd->exception_status;
        }

        if (exception_set_mask & found_mask)
                kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

        target->exception_enable_mask = exception_set_mask;

        mutex_unlock(&target->event_mutex);
}