/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>
#include <uapi/linux/kfd_sysfs.h>

#define MAX_WATCH_ADDRESSES	4
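/* Poll the next subscribed exception for a debugged process: queue
 * exceptions are reported first, then per-device exceptions, then
 * process exceptions.  Bits named in exception_clear_mask are cleared
 * on the entry that gets reported.
 */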
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
		      unsigned int *queue_id,
		      unsigned int *gpu_id,
		      uint64_t exception_clear_mask,
		      uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}
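/* Deferred wake-up of the debugger: poke the polled debug event file
 * descriptor from process context with a single byte.
 */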
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';

	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	if (process->debug_trap_enabled && process->dbg_ev_file)
		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}
/* update process/device/queue exception status, write to descriptor
 * only if exception_status is enabled.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}
/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		    p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
					process_queue_list) {
				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
						  exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}
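/* Forward an exception to the HSA runtime of the target process;
 * memory violations are signalled as VM faults and EC_PROCESS_RUNTIME
 * releases the runtime enable semaphore.
 */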
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
				      unsigned int dev_id,
				      unsigned int queue_id,
				      uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * block should only happen after the debugger receives runtime
		 * enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}
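/* Toggle the CWSR debug workaround on a single queue through an MQD
 * update; refused while a user CU mask is active on the queue.
 */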
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}
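/* Push the per-VMID debug configuration (SPI debug control, watch
 * points and trap flags) to the MES firmware scheduler.
 */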
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}
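/* Device watch point IDs are allocated from a per-device bitmask and
 * mirrored in the process-device bitmask so a process can only clear
 * watch points it owns.
 */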
#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->watch_points_lock);

	return -ENOMEM;
}
static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->watch_points_lock);
}
static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->watch_points_lock);

	return owns_watch_id;
}
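/* Clear a device address watch point previously set by this process. */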
int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					 uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}
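/* Allocate a watch point ID and program the address watch on every XCC
 * instance of the device.
 */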
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t *watch_id,
					uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}
static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}
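/* Validate requested debug trap flags against device capabilities and
 * apply them on all devices, rewinding already-updated devices on
 * failure.
 */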
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_topology_device *topo_dev =
				kfd_topology_device_by_id(target->pdds[i]->dev->id);
		uint32_t caps = topo_dev->node_props.capability;

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}
/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
		    release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}
static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}
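/* Tear down the debug session: deactivate if the runtime is enabled,
 * release the event file descriptor and drop the extra process
 * reference taken on enable.
 */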
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime if runtime not enabled otherwise reset
	 * attached running target runtime state to enable for re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	cancel_work_sync(&target->debug_event_workarea);
	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}
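/* Activate debug trapping on every device attached to the target
 * process; a failure on any device unwinds the ones already enabled.
 */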
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
						pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}
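/* Enable a debug session: run per-device pre-checks, pin the debugger
 * event file descriptor and activate immediately if the runtime is
 * already enabled, otherwise defer activation to runtime enable.
 */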
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}
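/* Check a requested wave launch trap override against the support mask
 * reported by every device of the process.
 */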
static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						  uint32_t trap_override,
						  uint32_t trap_mask_request,
						  uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
								pdd->dev->adev,
								trap_override,
								trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						   trap_override,
						   trap_mask_request,
						   trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}
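/* Copy the details of a single queue, device or process exception to
 * user space, optionally clearing its status bit.
 */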
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}
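/* Fill one snapshot entry per GPU attached to the target process and
 * copy the array of entries to user space.
 */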
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_entry_size = *entry_size;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdd of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}
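/* Update the subscribed exception mask and wake the debugger if any
 * already-raised exception is covered by the new mask.
 */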
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}