drivers/gpu/drm/amd/amdkfd/kfd_debug.c

/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>

#define MAX_WATCH_ADDRESSES     4

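/* kfd_dbg_ev_query_debug_event:
 *      Report one pending exception to the debugger and clear the bits in
 *      exception_clear_mask from its status. Queue events are searched
 *      first, then device events, then process events, so queue-level
 *      exceptions take reporting priority. Returns 0 if an event was
 *      found, -EAGAIN otherwise.
 */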
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
                      unsigned int *queue_id,
                      unsigned int *gpu_id,
                      uint64_t exception_clear_mask,
                      uint64_t *event_status)
{
        struct process_queue_manager *pqm;
        struct process_queue_node *pqn;
        int i;

        if (!(process && process->debug_trap_enabled))
                return -ENODATA;

        mutex_lock(&process->event_mutex);
        *event_status = 0;
        *queue_id = 0;
        *gpu_id = 0;

        /* find and report queue events */
        pqm = &process->pqm;
        list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
                uint64_t tmp = process->exception_enable_mask;

                if (!pqn->q)
                        continue;

                tmp &= pqn->q->properties.exception_status;

                if (!tmp)
                        continue;

                *event_status = pqn->q->properties.exception_status;
                *queue_id = pqn->q->properties.queue_id;
                *gpu_id = pqn->q->device->id;
                pqn->q->properties.exception_status &= ~exception_clear_mask;
                goto out;
        }

        /* find and report device events */
        for (i = 0; i < process->n_pdds; i++) {
                struct kfd_process_device *pdd = process->pdds[i];
                uint64_t tmp = process->exception_enable_mask
                                                & pdd->exception_status;

                if (!tmp)
                        continue;

                *event_status = pdd->exception_status;
                *gpu_id = pdd->dev->id;
                pdd->exception_status &= ~exception_clear_mask;
                goto out;
        }

        /* report process events */
        if (process->exception_enable_mask & process->exception_status) {
                *event_status = process->exception_status;
                process->exception_status &= ~exception_clear_mask;
        }

out:
        mutex_unlock(&process->event_mutex);
        return *event_status ? 0 : -EAGAIN;
}

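/* Poke the debugger by writing a single byte to the polled event file
 * descriptor. Runs from a workqueue so that raisers which cannot block on
 * file I/O can defer the write (see kfd_dbg_ev_raise() with
 * use_worker == true).
 */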
void debug_event_write_work_handler(struct work_struct *work)
{
        struct kfd_process *process;

        static const char write_data = '.';
        loff_t pos = 0;

        process = container_of(work,
                        struct kfd_process,
                        debug_event_workarea);

        kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status; write to the event
 * descriptor only if the exception is enabled in the process's
 * exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
                        struct kfd_process *process, struct kfd_node *dev,
                        unsigned int source_id, bool use_worker,
                        void *exception_data, size_t exception_data_size)
{
        struct process_queue_manager *pqm;
        struct process_queue_node *pqn;
        int i;
        static const char write_data = '.';
        loff_t pos = 0;
        bool is_subscribed = true;

        if (!(process && process->debug_trap_enabled))
                return false;

        mutex_lock(&process->event_mutex);

        if (event_mask & KFD_EC_MASK_DEVICE) {
                for (i = 0; i < process->n_pdds; i++) {
                        struct kfd_process_device *pdd = process->pdds[i];

                        if (pdd->dev != dev)
                                continue;

                        pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

                        if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
                                if (!pdd->vm_fault_exc_data) {
                                        pdd->vm_fault_exc_data = kmemdup(
                                                        exception_data,
                                                        exception_data_size,
                                                        GFP_KERNEL);
                                        if (!pdd->vm_fault_exc_data)
                                                pr_debug("Failed to allocate exception data memory\n");
                                } else {
                                        pr_debug("Debugger exception data not saved\n");
                                        print_hex_dump_bytes("exception data: ",
                                                        DUMP_PREFIX_OFFSET,
                                                        exception_data,
                                                        exception_data_size);
                                }
                        }
                        break;
                }
        } else if (event_mask & KFD_EC_MASK_PROCESS) {
                process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
        } else {
                pqm = &process->pqm;
                list_for_each_entry(pqn, &pqm->queues,
                                process_queue_list) {
                        int target_id;

                        if (!pqn->q)
                                continue;

                        target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
                                        pqn->q->properties.queue_id :
                                                        pqn->q->doorbell_id;

                        if (pqn->q->device != dev || target_id != source_id)
                                continue;

                        pqn->q->properties.exception_status |= event_mask;
                        break;
                }
        }

        if (process->exception_enable_mask & event_mask) {
                if (use_worker)
                        schedule_work(&process->debug_event_workarea);
                else
                        kernel_write(process->dbg_ev_file,
                                        &write_data,
                                        1,
                                        &pos);
        } else {
                is_subscribed = false;
        }

        mutex_unlock(&process->event_mutex);

        return is_subscribed;
}

/* Set a pending debug event from an interrupt ring entry. If the debugger
 * did not subscribe to the exception, fall back to notifying the runtime:
 * queue exceptions are sent to the runtime directly, and device memory
 * violations evict the process and raise a VM fault event.
 */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
                                   unsigned int pasid,
                                   uint32_t doorbell_id,
                                   uint64_t trap_mask,
                                   void *exception_data,
                                   size_t exception_data_size)
{
        struct kfd_process *p;
        bool signaled_to_debugger_or_runtime = false;

        p = kfd_lookup_process_by_pasid(pasid);

        if (!p)
                return false;

        if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
                              exception_data, exception_data_size)) {
                struct process_queue_manager *pqm;
                struct process_queue_node *pqn;

                if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
                       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
                        mutex_lock(&p->mutex);

                        pqm = &p->pqm;
                        list_for_each_entry(pqn, &pqm->queues,
                                                        process_queue_list) {

                                if (!(pqn->q && pqn->q->device == dev &&
                                      pqn->q->doorbell_id == doorbell_id))
                                        continue;

                                kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
                                                              trap_mask);

                                signaled_to_debugger_or_runtime = true;

                                break;
                        }

                        mutex_unlock(&p->mutex);
                } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
                        kfd_dqm_evict_pasid(dev->dqm, p->pasid);
                        kfd_signal_vm_fault_event(dev, p->pasid, NULL,
                                                        exception_data);

                        signaled_to_debugger_or_runtime = true;
                }
        } else {
                signaled_to_debugger_or_runtime = true;
        }

        kfd_unref_process(p);

        return signaled_to_debugger_or_runtime;
}

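/* kfd_dbg_send_exception_to_runtime:
 *      Forward debugger-raised exceptions to the HSA runtime. A memory
 *      violation evicts the process queues and raises a VM fault event,
 *      EC_PROCESS_RUNTIME releases the runtime-enable semaphore the
 *      runtime is blocked on, and any remaining reasons are passed
 *      through to kfd_send_exception_to_runtime().
 */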
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
                                        unsigned int dev_id,
                                        unsigned int queue_id,
                                        uint64_t error_reason)
{
        if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
                struct kfd_process_device *pdd = NULL;
                struct kfd_hsa_memory_exception_data *data;
                int i;

                for (i = 0; i < p->n_pdds; i++) {
                        if (p->pdds[i]->dev->id == dev_id) {
                                pdd = p->pdds[i];
                                break;
                        }
                }

                if (!pdd)
                        return -ENODEV;

                data = (struct kfd_hsa_memory_exception_data *)
                                                pdd->vm_fault_exc_data;

                kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
                kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
                error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
        }

        if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
                /*
                 * Unblocking should only happen after the debugger has
                 * received the runtime enable notice.
                 */
                up(&p->runtime_enable_sema);
                error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
        }

        if (error_reason)
                return kfd_send_exception_to_runtime(p, queue_id, error_reason);

        return 0;
}

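/* Toggle the CWSR debug workaround on a single queue. Enabling is refused
 * with -EBUSY while the queue has a user CU mask set, presumably because
 * the workaround and user CU masking cannot be applied at the same time.
 */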
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
        struct mqd_update_info minfo = {0};
        int err;

        if (!q)
                return 0;

        if (!kfd_dbg_has_cwsr_workaround(q->device))
                return 0;

        if (enable && q->properties.is_user_cu_masked)
                return -EBUSY;

        minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

        q->properties.is_dbg_wa = enable;
        err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
        if (err)
                q->properties.is_dbg_wa = false;

        return err;
}

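/* Apply (or remove) the CWSR debug workaround on all queues of the target
 * process. A failed enable unwinds every queue back to the disabled state
 * and records in the runtime state whether the failure was -EBUSY or a
 * general error.
 */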
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
        struct process_queue_manager *pqm = &target->pqm;
        struct process_queue_node *pqn;
        int r = 0;

        list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
                r = kfd_dbg_set_queue_workaround(pqn->q, enable);
                if (enable && r)
                        goto unwind;
        }

        return 0;

unwind:
        list_for_each_entry(pqn, &pqm->queues, process_queue_list)
                kfd_dbg_set_queue_workaround(pqn->q, false);

        if (enable)
                target->runtime_info.runtime_state = r == -EBUSY ?
                                DEBUG_RUNTIME_STATE_ENABLED_BUSY :
                                DEBUG_RUNTIME_STATE_ENABLED_ERROR;

        return r;
}

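/* Push the per-VMID debug state (SPI debug override and launch mode, watch
 * points and debug flags) to the MES firmware scheduler. A no-op on
 * devices without per-VMID debugging support.
 */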
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
        uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
        uint32_t flags = pdd->process->dbg_flags;

        if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
                return 0;

        return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
                                                pdd->watch_points, flags, sq_trap_en);
}

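/* Address watch points are a device-wide resource shared between processes.
 * Allocations are tracked in two bitmasks guarded by watch_points_lock: one
 * per device (kfd->alloc_watch_ids) and one per process-device
 * (pdd->alloc_watch_ids), so a process can only release watch points that
 * it owns.
 */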
#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
        int i;

        *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

        spin_lock(&pdd->dev->kfd->watch_points_lock);

        for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
                /* device watchpoint in use so skip */
                if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
                        continue;

                pdd->alloc_watch_ids |= 0x1 << i;
                pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
                *watch_id = i;
                spin_unlock(&pdd->dev->kfd->watch_points_lock);
                return 0;
        }

        spin_unlock(&pdd->dev->kfd->watch_points_lock);

        return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
        spin_lock(&pdd->dev->kfd->watch_points_lock);

        /* process owns device watch point so safe to clear */
        if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
                pdd->alloc_watch_ids &= ~(0x1 << watch_id);
                pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
        }

        spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
        bool owns_watch_id = false;

        spin_lock(&pdd->dev->kfd->watch_points_lock);
        owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
                        ((pdd->alloc_watch_ids >> watch_id) & 0x1);

        spin_unlock(&pdd->dev->kfd->watch_points_lock);

        return owns_watch_id;
}

int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
                                        uint32_t watch_id)
{
        int r;

        if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
                return -EINVAL;

        if (!pdd->dev->kfd->shared_resources.enable_mes) {
                r = debug_lock_and_unmap(pdd->dev->dqm);
                if (r)
                        return r;
        }

        amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
        pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
                                                        pdd->dev->adev,
                                                        watch_id);
        amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

        if (!pdd->dev->kfd->shared_resources.enable_mes)
                r = debug_map_and_unlock(pdd->dev->dqm);
        else
                r = kfd_dbg_set_mes_debug_mode(pdd, true);

        kfd_dbg_clear_dev_watch_id(pdd, watch_id);

        return r;
}

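/* Program a freshly allocated watch point on every XCC instance. On
 * non-MES devices the queues are unmapped around the register writes and
 * remapped afterwards; GFX off is suspended so the debug registers are not
 * touched while the block is power gated.
 */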
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
                                        uint64_t watch_address,
                                        uint32_t watch_address_mask,
                                        uint32_t *watch_id,
                                        uint32_t watch_mode)
{
        int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
        uint32_t xcc_mask = pdd->dev->xcc_mask;

        if (r)
                return r;

        if (!pdd->dev->kfd->shared_resources.enable_mes) {
                r = debug_lock_and_unmap(pdd->dev->dqm);
                if (r) {
                        kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
                        return r;
                }
        }

        amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
        for_each_inst(xcc_id, xcc_mask)
                pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
                                pdd->dev->adev,
                                watch_address,
                                watch_address_mask,
                                *watch_id,
                                watch_mode,
                                pdd->dev->vm_info.last_vmid_kfd,
                                xcc_id);
        amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

        if (!pdd->dev->kfd->shared_resources.enable_mes)
                r = debug_map_and_unlock(pdd->dev->dqm);
        else
                r = kfd_dbg_set_mes_debug_mode(pdd, true);

        /* HWS is broken so no point in HW rollback but release the watchpoint anyway */
        if (r)
                kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

        return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
        int i, j;

        for (i = 0; i < target->n_pdds; i++)
                for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
                        kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

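/* kfd_dbg_trap_set_flags:
 *      Apply the requested debug flags and refresh the debug state on
 *      every device. *flags returns the previous flags to the caller.
 *      Fails with -EACCES if KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP is requested
 *      on a device without per-VMID debugging support; a partial failure
 *      restores the previous flags on the devices already updated.
 */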
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
        uint32_t prev_flags = target->dbg_flags;
        int i, r = 0, rewind_count = 0;

        for (i = 0; i < target->n_pdds; i++) {
                if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
                        (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
                        *flags = prev_flags;
                        return -EACCES;
                }
        }

        target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
        *flags = prev_flags;
        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
                        continue;

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        r = debug_refresh_runlist(pdd->dev->dqm);
                else
                        r = kfd_dbg_set_mes_debug_mode(pdd, true);

                if (r) {
                        target->dbg_flags = prev_flags;
                        break;
                }

                rewind_count++;
        }

        /* Rewind flags */
        if (r) {
                target->dbg_flags = prev_flags;

                for (i = 0; i < rewind_count; i++) {
                        struct kfd_process_device *pdd = target->pdds[i];

                        if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
                                continue;

                        if (!pdd->dev->kfd->shared_resources.enable_mes)
                                debug_refresh_runlist(pdd->dev->dqm);
                        else
                                kfd_dbg_set_mes_debug_mode(pdd, true);
                }
        }

        return r;
}

/* kfd_dbg_trap_deactivate:
 *      target: target process
 *      unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *      unwind_count:
 *              If unwind == true, how far down the pdd list we need
 *              to unwind
 *              else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
        int i;

        if (!unwind) {
                uint32_t flags = 0;
                int resume_count = resume_queues(target, 0, NULL);

                if (resume_count)
                        pr_debug("Resumed %d queues\n", resume_count);

                cancel_work_sync(&target->debug_event_workarea);
                kfd_dbg_clear_process_address_watch(target);
                kfd_dbg_trap_set_wave_launch_mode(target, 0);

                kfd_dbg_trap_set_flags(target, &flags);
        }

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                /* If this is an unwind, and we have unwound the required
                 * enable calls on the pdd list, we need to stop now
                 * otherwise we may mess up another debugger session.
                 */
                if (unwind && i == unwind_count)
                        break;

                kfd_process_set_trap_debug_flag(&pdd->qpd, false);

                /* GFX off was already disabled by debug activate if RLC restore is not supported. */
                if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
                        amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
                pdd->spi_dbg_override =
                                pdd->dev->kfd2kgd->disable_debug_trap(
                                pdd->dev->adev,
                                target->runtime_info.ttmp_setup,
                                pdd->dev->vm_info.last_vmid_kfd);
                amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

                if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
                                release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
                        pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        debug_refresh_runlist(pdd->dev->dqm);
                else
                        kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
        }

        kfd_dbg_set_workaround(target, false);
}

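/* Drain pending interrupts and clear the exception status words at the
 * device, queue and process level so a later debug session starts from a
 * clean slate.
 */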
static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
        struct process_queue_manager *pqm;
        struct process_queue_node *pqn;
        int i;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                kfd_process_drain_interrupts(pdd);

                pdd->exception_status = 0;
        }

        pqm = &target->pqm;
        list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
                if (!pqn->q)
                        continue;

                pqn->q->properties.exception_status = 0;
        }

        target->exception_status = 0;
}

int kfd_dbg_trap_disable(struct kfd_process *target)
{
        if (!target->debug_trap_enabled)
                return 0;

        /*
         * If the runtime is not enabled, defer deactivation until runtime
         * enable. Otherwise reset the attached, running target's runtime
         * state back to enabled so that it can be re-attached.
         */
        if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
                kfd_dbg_trap_deactivate(target, false, 0);
        else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
                target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

        fput(target->dbg_ev_file);
        target->dbg_ev_file = NULL;

        if (target->debugger_process) {
                atomic_dec(&target->debugger_process->debugged_process_count);
                target->debugger_process = NULL;
        }

        target->debug_trap_enabled = false;
        kfd_dbg_clean_exception_status(target);
        kfd_unref_process(target);

        return 0;
}

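/* kfd_dbg_trap_activate:
 *      Enable debug trapping on every device of the target process: apply
 *      the CWSR workaround, reserve a debug VMID where per-VMID debugging
 *      is unsupported, program the SPI debug registers, set the trap
 *      handler debug flag and refresh the runlist (or MES debug mode).
 *      All or nothing: any failure unwinds the devices enabled so far via
 *      kfd_dbg_trap_deactivate().
 */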
int kfd_dbg_trap_activate(struct kfd_process *target)
{
        int i, r = 0;

        r = kfd_dbg_set_workaround(target, true);
        if (r)
                return r;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
                        r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

                        if (r) {
                                target->runtime_info.runtime_state = (r == -EBUSY) ?
                                                        DEBUG_RUNTIME_STATE_ENABLED_BUSY :
                                                        DEBUG_RUNTIME_STATE_ENABLED_ERROR;

                                goto unwind_err;
                        }
                }

                /* Disable GFX OFF to prevent garbage read/writes to debug registers.
                 * If RLC restore of debug registers is not supported and runtime enable
                 * hasn't done so already on ttmp setup request, restore the trap config registers.
                 *
                 * If RLC restore of debug registers is not supported, keep gfx off disabled for
                 * the debug session.
                 */
                amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
                if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
                                                target->runtime_info.ttmp_setup))
                        pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
                                                                pdd->dev->vm_info.last_vmid_kfd);

                pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
                                        pdd->dev->adev,
                                        false,
                                        pdd->dev->vm_info.last_vmid_kfd);

                if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
                        amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

                /*
                 * Setting the debug flag in the trap handler requires that the TMA has been
                 * allocated, which occurs during CWSR initialization.
                 * In the event that CWSR has not been initialized at this point, setting the
                 * flag will be called again during CWSR initialization if the target process
                 * is still debug enabled.
                 */
                kfd_process_set_trap_debug_flag(&pdd->qpd, true);

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        r = debug_refresh_runlist(pdd->dev->dqm);
                else
                        r = kfd_dbg_set_mes_debug_mode(pdd, true);

                if (r) {
                        target->runtime_info.runtime_state =
                                        DEBUG_RUNTIME_STATE_ENABLED_ERROR;
                        goto unwind_err;
                }
        }

        return 0;

unwind_err:
        /* Enabling debug failed, we need to disable on
         * all GPUs so the enable is all or nothing.
         */
        kfd_dbg_trap_deactivate(target, true, i);
        return r;
}

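/* kfd_dbg_trap_enable:
 *      Attach the debugger to the target process. @fd is a file descriptor
 *      supplied by the debugger; raised exceptions are signalled by
 *      writing a single byte to it, so the debugger can poll the other
 *      end. Activation itself is deferred until the target's runtime is
 *      enabled.
 *
 *      A rough sketch of the debugger-side flow (illustrative pseudocode
 *      only; the exact argument structs live in uapi/linux/kfd_ioctl.h):
 *
 *              pipe(ev_fd);                            // ev_fd[1] becomes @fd
 *              ioctl(kfd, AMDKFD_IOC_DBG_TRAP, &args); // op = enable
 *              poll(&(struct pollfd){ev_fd[0], POLLIN}, 1, -1);
 *              read(ev_fd[0], &byte, 1);               // drain the wakeup
 *              // then query pending events, e.g. the query-debug-event op
 */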
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
                        void __user *runtime_info, uint32_t *runtime_size)
{
        struct file *f;
        uint32_t copy_size;
        int i, r = 0;

        if (target->debug_trap_enabled)
                return -EALREADY;

        /* Enable pre-checks */
        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                if (!KFD_IS_SOC15(pdd->dev))
                        return -ENODEV;

                if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
                                         kfd_dbg_has_cwsr_workaround(pdd->dev)))
                        return -EBUSY;
        }

        copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

        f = fget(fd);
        if (!f) {
                pr_err("Failed to get file for (%i)\n", fd);
                return -EBADF;
        }

        target->dbg_ev_file = f;

        /* defer activation to runtime enable if the runtime is not enabled yet */
        if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
                kfd_dbg_trap_activate(target);

        /* We already hold the process reference but hold another one for the
         * debug session.
         */
        kref_get(&target->ref);
        target->debug_trap_enabled = true;

        if (target->debugger_process)
                atomic_inc(&target->debugger_process->debugged_process_count);

        if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
                kfd_dbg_trap_deactivate(target, false, 0);
                r = -EFAULT;
        }

        *runtime_size = sizeof(target->runtime_info);

        return r;
}

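/* Intersect the trap override masks supported by every device of the
 * process and reject the request with -EACCES if it asks for bits outside
 * that common subset.
 */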
static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
                                                uint32_t trap_override,
                                                uint32_t trap_mask_request,
                                                uint32_t *trap_mask_supported)
{
        int i = 0;

        *trap_mask_supported = 0xffffffff;

        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
                int err = pdd->dev->kfd2kgd->validate_trap_override_request(
                                                                pdd->dev->adev,
                                                                trap_override,
                                                                trap_mask_supported);

                if (err)
                        return err;
        }

        if (trap_mask_request & ~*trap_mask_supported)
                return -EACCES;

        return 0;
}

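/* Apply a wave launch trap override on every device of the target, then
 * refresh the runlist (or MES debug mode) so the new SPI debug state takes
 * effect.
 */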
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
                                        uint32_t trap_override,
                                        uint32_t trap_mask_bits,
                                        uint32_t trap_mask_request,
                                        uint32_t *trap_mask_prev,
                                        uint32_t *trap_mask_supported)
{
        int r = 0, i;

        r = kfd_dbg_validate_trap_override_request(target,
                                                trap_override,
                                                trap_mask_request,
                                                trap_mask_supported);

        if (r)
                return r;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
                pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
                                pdd->dev->adev,
                                pdd->dev->vm_info.last_vmid_kfd,
                                trap_override,
                                trap_mask_bits,
                                trap_mask_request,
                                trap_mask_prev,
                                pdd->spi_dbg_override);
                amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        r = debug_refresh_runlist(pdd->dev->dqm);
                else
                        r = kfd_dbg_set_mes_debug_mode(pdd, true);

                if (r)
                        break;
        }

        return r;
}

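/* Set the wave launch mode (normal, halt or debug) on every device of the
 * target process.
 */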
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
                                        uint8_t wave_launch_mode)
{
        int r = 0, i;

        if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
                        wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
                        wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
                return -EINVAL;

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
                pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
                                pdd->dev->adev,
                                wave_launch_mode,
                                pdd->dev->vm_info.last_vmid_kfd);
                amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

                if (!pdd->dev->kfd->shared_resources.enable_mes)
                        r = debug_refresh_runlist(pdd->dev->dqm);
                else
                        r = kfd_dbg_set_mes_debug_mode(pdd, true);

                if (r)
                        break;
        }

        return r;
}

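/* kfd_dbg_trap_query_exception_info:
 *      Copy the exception payload for a single (source_id, exception_code)
 *      pair to the debugger: VM fault data for device memory violations,
 *      runtime info for EC_PROCESS_RUNTIME, nothing for status-only
 *      exceptions. Optionally clears the exception bit afterwards.
 *      *info_size returns the actual payload size.
 */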
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
                uint32_t source_id,
                uint32_t exception_code,
                bool clear_exception,
                void __user *info,
                uint32_t *info_size)
{
        bool found = false;
        int r = 0;
        uint32_t copy_size, actual_info_size = 0;
        uint64_t *exception_status_ptr = NULL;

        if (!target)
                return -EINVAL;

        if (!info || !info_size)
                return -EINVAL;

        mutex_lock(&target->event_mutex);

        if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
                /* Per queue exceptions */
                struct queue *queue = NULL;
                int i;

                for (i = 0; i < target->n_pdds; i++) {
                        struct kfd_process_device *pdd = target->pdds[i];
                        struct qcm_process_device *qpd = &pdd->qpd;

                        list_for_each_entry(queue, &qpd->queues_list, list) {
                                if (!found && queue->properties.queue_id == source_id) {
                                        found = true;
                                        break;
                                }
                        }
                        if (found)
                                break;
                }

                if (!found) {
                        r = -EINVAL;
                        goto out;
                }

                if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
                        r = -ENODATA;
                        goto out;
                }
                exception_status_ptr = &queue->properties.exception_status;
        } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
                /* Per device exceptions */
                struct kfd_process_device *pdd = NULL;
                int i;

                for (i = 0; i < target->n_pdds; i++) {
                        pdd = target->pdds[i];
                        if (pdd->dev->id == source_id) {
                                found = true;
                                break;
                        }
                }

                if (!found) {
                        r = -EINVAL;
                        goto out;
                }

                if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
                        r = -ENODATA;
                        goto out;
                }

                if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
                        copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

                        if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
                                r = -EFAULT;
                                goto out;
                        }
                        actual_info_size = pdd->vm_fault_exc_data_size;
                        if (clear_exception) {
                                kfree(pdd->vm_fault_exc_data);
                                pdd->vm_fault_exc_data = NULL;
                                pdd->vm_fault_exc_data_size = 0;
                        }
                }
                exception_status_ptr = &pdd->exception_status;
        } else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
                /* Per process exceptions */
                if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
                        r = -ENODATA;
                        goto out;
                }

                if (exception_code == EC_PROCESS_RUNTIME) {
                        copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

                        if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
                                r = -EFAULT;
                                goto out;
                        }

                        actual_info_size = sizeof(target->runtime_info);
                }

                exception_status_ptr = &target->exception_status;
        } else {
                pr_debug("Bad exception type [%i]\n", exception_code);
                r = -EINVAL;
                goto out;
        }

        *info_size = actual_info_size;
        if (clear_exception)
                *exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
        mutex_unlock(&target->event_mutex);
        return r;
}

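/* kfd_dbg_trap_device_snapshot:
 *      Copy one kfd_dbg_device_info_entry per device of the target process
 *      to the debugger, clamped to the caller's buffer. The total device
 *      count and the entry size actually written are returned through
 *      *number_of_device_infos and *entry_size.
 */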
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
                uint64_t exception_clear_mask,
                void __user *user_info,
                uint32_t *number_of_device_infos,
                uint32_t *entry_size)
{
        struct kfd_dbg_device_info_entry device_info;
        uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
        int i, r = 0;

        if (!(target && user_info && number_of_device_infos && entry_size))
                return -EINVAL;

        tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
        *number_of_device_infos = target->n_pdds;
        *entry_size = min_t(size_t, *entry_size, sizeof(device_info));

        if (!tmp_num_devices)
                return 0;

        memset(&device_info, 0, sizeof(device_info));

        mutex_lock(&target->event_mutex);

        /* Run over all pdd of the process */
        for (i = 0; i < tmp_num_devices; i++) {
                struct kfd_process_device *pdd = target->pdds[i];
                struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

                device_info.gpu_id = pdd->dev->id;
                device_info.exception_status = pdd->exception_status;
                device_info.lds_base = pdd->lds_base;
                device_info.lds_limit = pdd->lds_limit;
                device_info.scratch_base = pdd->scratch_base;
                device_info.scratch_limit = pdd->scratch_limit;
                device_info.gpuvm_base = pdd->gpuvm_base;
                device_info.gpuvm_limit = pdd->gpuvm_limit;
                device_info.location_id = topo_dev->node_props.location_id;
                device_info.vendor_id = topo_dev->node_props.vendor_id;
                device_info.device_id = topo_dev->node_props.device_id;
                device_info.revision_id = pdd->dev->adev->pdev->revision;
                device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
                device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
                device_info.fw_version = pdd->dev->kfd->mec_fw_version;
                device_info.gfx_target_version =
                        topo_dev->node_props.gfx_target_version;
                device_info.simd_count = topo_dev->node_props.simd_count;
                device_info.max_waves_per_simd =
                        topo_dev->node_props.max_waves_per_simd;
                device_info.array_count = topo_dev->node_props.array_count;
                device_info.simd_arrays_per_engine =
                        topo_dev->node_props.simd_arrays_per_engine;
                device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
                device_info.capability = topo_dev->node_props.capability;
                device_info.debug_prop = topo_dev->node_props.debug_prop;

                if (exception_clear_mask)
                        pdd->exception_status &= ~exception_clear_mask;

                if (copy_to_user(user_info, &device_info, *entry_size)) {
                        r = -EFAULT;
                        break;
                }

                user_info += tmp_entry_size;
        }

        mutex_unlock(&target->event_mutex);

        return r;
}

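/* Update the debugger's exception subscription mask and immediately wake
 * the debugger if a pending exception at queue, device or process level
 * already matches the new mask.
 */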
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
                                        uint64_t exception_set_mask)
{
        uint64_t found_mask = 0;
        struct process_queue_manager *pqm;
        struct process_queue_node *pqn;
        static const char write_data = '.';
        loff_t pos = 0;
        int i;

        mutex_lock(&target->event_mutex);

        found_mask |= target->exception_status;

        pqm = &target->pqm;
        list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
                if (!pqn->q)
                        continue;

                found_mask |= pqn->q->properties.exception_status;
        }

        for (i = 0; i < target->n_pdds; i++) {
                struct kfd_process_device *pdd = target->pdds[i];

                found_mask |= pdd->exception_status;
        }

        if (exception_set_mask & found_mask)
                kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

        target->exception_enable_mask = exception_set_mask;

        mutex_unlock(&target->event_mutex);
}