]> Git Repo - J-linux.git/blob - drivers/gpu/drm/xe/xe_devcoredump.c
Merge tag 'kbuild-v6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy...
[J-linux.git] / drivers / gpu / drm / xe / xe_devcoredump.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2023 Intel Corporation
4  */
5
6 #include "xe_devcoredump.h"
7 #include "xe_devcoredump_types.h"
8
9 #include <linux/devcoredump.h>
10 #include <generated/utsrelease.h>
11
12 #include "xe_device.h"
13 #include "xe_exec_queue.h"
14 #include "xe_force_wake.h"
15 #include "xe_gt.h"
16 #include "xe_guc_ct.h"
17 #include "xe_guc_submit.h"
18 #include "xe_hw_engine.h"
19 #include "xe_sched_job.h"
20 #include "xe_vm.h"
21
22 /**
23  * DOC: Xe device coredump
24  *
25  * Devices overview:
26  * Xe uses dev_coredump infrastructure for exposing the crash errors in a
27  * standardized way.
28  * devcoredump exposes a temporary device under /sys/class/devcoredump/
29  * which is linked with our card device directly.
30  * The core dump can be accessed either from
31  * /sys/class/drm/card<n>/device/devcoredump/ or from
32  * /sys/class/devcoredump/devcd<m> where
33  * /sys/class/devcoredump/devcd<m>/failing_device is a link to
34  * /sys/class/drm/card<n>/device/.
35  *
36  * Snapshot at hang:
37  * The 'data' file is printed with a drm_printer pointer at devcoredump read
38  * time. For this reason, we need to take snapshots from when the hang has
39  * happened, and not only when the user is reading the file. Otherwise the
40  * information is outdated since the resets might have happened in between.
41  *
42  * 'First' failure snapshot:
43  * In general, the first hang is the most critical one since the following hangs
44  * can be a consequence of the initial hang. For this reason we only take the
45  * snapshot of the 'first' failure and ignore subsequent calls of this function,
46  * at least while the coredump device is alive. Dev_coredump has a delayed work
47  * queue that will eventually delete the device and free all the dump
48  * information.
49  */
50
51 #ifdef CONFIG_DEV_COREDUMP
52
53 static struct xe_device *coredump_to_xe(const struct xe_devcoredump *coredump)
54 {
55         return container_of(coredump, struct xe_device, devcoredump);
56 }
57
58 static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
59 {
60         return &q->gt->uc.guc;
61 }
62
63 static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
64 {
65         struct xe_devcoredump_snapshot *ss = container_of(work, typeof(*ss), work);
66
67         xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
68         if (ss->vm)
69                 xe_vm_snapshot_capture_delayed(ss->vm);
70         xe_force_wake_put(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
71 }
72
73 static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
74                                    size_t count, void *data, size_t datalen)
75 {
76         struct xe_devcoredump *coredump = data;
77         struct xe_device *xe = coredump_to_xe(coredump);
78         struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
79         struct drm_printer p;
80         struct drm_print_iterator iter;
81         struct timespec64 ts;
82         int i;
83
84         /* Our device is gone already... */
85         if (!data || !coredump_to_xe(coredump))
86                 return -ENODEV;
87
88         /* Ensure delayed work is captured before continuing */
89         flush_work(&ss->work);
90
91         iter.data = buffer;
92         iter.offset = 0;
93         iter.start = offset;
94         iter.remain = count;
95
96         p = drm_coredump_printer(&iter);
97
98         drm_printf(&p, "**** Xe Device Coredump ****\n");
99         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
100         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
101
102         ts = ktime_to_timespec64(ss->snapshot_time);
103         drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
104         ts = ktime_to_timespec64(ss->boot_time);
105         drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
106         xe_device_snapshot_print(xe, &p);
107
108         drm_printf(&p, "\n**** GuC CT ****\n");
109         xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p);
110         xe_guc_exec_queue_snapshot_print(coredump->snapshot.ge, &p);
111
112         drm_printf(&p, "\n**** Job ****\n");
113         xe_sched_job_snapshot_print(coredump->snapshot.job, &p);
114
115         drm_printf(&p, "\n**** HW Engines ****\n");
116         for (i = 0; i < XE_NUM_HW_ENGINES; i++)
117                 if (coredump->snapshot.hwe[i])
118                         xe_hw_engine_snapshot_print(coredump->snapshot.hwe[i],
119                                                     &p);
120         if (coredump->snapshot.vm) {
121                 drm_printf(&p, "\n**** VM state ****\n");
122                 xe_vm_snapshot_print(coredump->snapshot.vm, &p);
123         }
124
125         return count - iter.remain;
126 }
127
128 static void xe_devcoredump_free(void *data)
129 {
130         struct xe_devcoredump *coredump = data;
131         int i;
132
133         /* Our device is gone. Nothing to do... */
134         if (!data || !coredump_to_xe(coredump))
135                 return;
136
137         cancel_work_sync(&coredump->snapshot.work);
138
139         xe_guc_ct_snapshot_free(coredump->snapshot.ct);
140         xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
141         xe_sched_job_snapshot_free(coredump->snapshot.job);
142         for (i = 0; i < XE_NUM_HW_ENGINES; i++)
143                 if (coredump->snapshot.hwe[i])
144                         xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]);
145         xe_vm_snapshot_free(coredump->snapshot.vm);
146
147         /* To prevent stale data on next snapshot, clear everything */
148         memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
149         coredump->captured = false;
150         drm_info(&coredump_to_xe(coredump)->drm,
151                  "Xe device coredump has been deleted.\n");
152 }
153
154 static void devcoredump_snapshot(struct xe_devcoredump *coredump,
155                                  struct xe_sched_job *job)
156 {
157         struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
158         struct xe_exec_queue *q = job->q;
159         struct xe_guc *guc = exec_queue_to_guc(q);
160         struct xe_hw_engine *hwe;
161         enum xe_hw_engine_id id;
162         u32 adj_logical_mask = q->logical_mask;
163         u32 width_mask = (0x1 << q->width) - 1;
164         int i;
165         bool cookie;
166
167         ss->snapshot_time = ktime_get_real();
168         ss->boot_time = ktime_get_boottime();
169
170         ss->gt = q->gt;
171         INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
172
173         cookie = dma_fence_begin_signalling();
174         for (i = 0; q->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) {
175                 if (adj_logical_mask & BIT(i)) {
176                         adj_logical_mask |= width_mask << i;
177                         i += q->width;
178                 } else {
179                         ++i;
180                 }
181         }
182
183         xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
184
185         coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
186         coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(job);
187         coredump->snapshot.job = xe_sched_job_snapshot_capture(job);
188         coredump->snapshot.vm = xe_vm_snapshot_capture(q->vm);
189
190         for_each_hw_engine(hwe, q->gt, id) {
191                 if (hwe->class != q->hwe->class ||
192                     !(BIT(hwe->logical_instance) & adj_logical_mask)) {
193                         coredump->snapshot.hwe[id] = NULL;
194                         continue;
195                 }
196                 coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe);
197         }
198
199         if (ss->vm)
200                 queue_work(system_unbound_wq, &ss->work);
201
202         xe_force_wake_put(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
203         dma_fence_end_signalling(cookie);
204 }
205
206 /**
207  * xe_devcoredump - Take the required snapshots and initialize coredump device.
208  * @job: The faulty xe_sched_job, where the issue was detected.
209  *
210  * This function should be called at the crash time within the serialized
211  * gt_reset. It is skipped if we still have the core dump device available
212  * with the information of the 'first' snapshot.
213  */
214 void xe_devcoredump(struct xe_sched_job *job)
215 {
216         struct xe_device *xe = gt_to_xe(job->q->gt);
217         struct xe_devcoredump *coredump = &xe->devcoredump;
218
219         if (coredump->captured) {
220                 drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
221                 return;
222         }
223
224         coredump->captured = true;
225         devcoredump_snapshot(coredump, job);
226
227         drm_info(&xe->drm, "Xe device coredump has been created\n");
228         drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
229                  xe->drm.primary->index);
230
231         dev_coredumpm(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
232                       xe_devcoredump_read, xe_devcoredump_free);
233 }
234 #endif
235
This page took 0.040516 seconds and 4 git commands to generate.