]> Git Repo - linux.git/blob - drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
Merge topic branches 'clkdev' and 'fixes' into for-linus
[linux.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_dev_coredump.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2024 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24
25 #include <generated/utsrelease.h>
26 #include <linux/devcoredump.h>
27 #include "amdgpu_dev_coredump.h"
28 #include "atom.h"
29
30 #ifndef CONFIG_DEV_COREDUMP
31 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
32                      struct amdgpu_reset_context *reset_context)
33 {
34 }
35 #else
36
37 const char *hw_ip_names[MAX_HWIP] = {
38         [GC_HWIP]               = "GC",
39         [HDP_HWIP]              = "HDP",
40         [SDMA0_HWIP]            = "SDMA0",
41         [SDMA1_HWIP]            = "SDMA1",
42         [SDMA2_HWIP]            = "SDMA2",
43         [SDMA3_HWIP]            = "SDMA3",
44         [SDMA4_HWIP]            = "SDMA4",
45         [SDMA5_HWIP]            = "SDMA5",
46         [SDMA6_HWIP]            = "SDMA6",
47         [SDMA7_HWIP]            = "SDMA7",
48         [LSDMA_HWIP]            = "LSDMA",
49         [MMHUB_HWIP]            = "MMHUB",
50         [ATHUB_HWIP]            = "ATHUB",
51         [NBIO_HWIP]             = "NBIO",
52         [MP0_HWIP]              = "MP0",
53         [MP1_HWIP]              = "MP1",
54         [UVD_HWIP]              = "UVD/JPEG/VCN",
55         [VCN1_HWIP]             = "VCN1",
56         [VCE_HWIP]              = "VCE",
57         [VPE_HWIP]              = "VPE",
58         [DF_HWIP]               = "DF",
59         [DCE_HWIP]              = "DCE",
60         [OSSSYS_HWIP]           = "OSSSYS",
61         [SMUIO_HWIP]            = "SMUIO",
62         [PWR_HWIP]              = "PWR",
63         [NBIF_HWIP]             = "NBIF",
64         [THM_HWIP]              = "THM",
65         [CLK_HWIP]              = "CLK",
66         [UMC_HWIP]              = "UMC",
67         [RSMU_HWIP]             = "RSMU",
68         [XGMI_HWIP]             = "XGMI",
69         [DCI_HWIP]              = "DCI",
70         [PCIE_HWIP]             = "PCIE",
71 };
72
73 static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
74                                        struct drm_printer *p)
75 {
76         uint32_t version;
77         uint32_t feature;
78         uint8_t smu_program, smu_major, smu_minor, smu_debug;
79         struct atom_context *ctx = adev->mode_info.atom_context;
80
81         drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n",
82                    adev->vce.fb_version, adev->vce.fw_version);
83         drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0,
84                    adev->uvd.fw_version);
85         drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0,
86                    adev->gmc.fw_version);
87         drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n",
88                    adev->gfx.me_feature_version, adev->gfx.me_fw_version);
89         drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n",
90                    adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
91         drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n",
92                    adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
93         drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n",
94                    adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
95
96         drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n",
97                    adev->gfx.rlc_srlc_feature_version,
98                    adev->gfx.rlc_srlc_fw_version);
99         drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n",
100                    adev->gfx.rlc_srlg_feature_version,
101                    adev->gfx.rlc_srlg_fw_version);
102         drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n",
103                    adev->gfx.rlc_srls_feature_version,
104                    adev->gfx.rlc_srls_fw_version);
105         drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n",
106                    adev->gfx.rlcp_ucode_feature_version,
107                    adev->gfx.rlcp_ucode_version);
108         drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n",
109                    adev->gfx.rlcv_ucode_feature_version,
110                    adev->gfx.rlcv_ucode_version);
111         drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n",
112                    adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
113
114         if (adev->gfx.mec2_fw)
115                 drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n",
116                            adev->gfx.mec2_feature_version,
117                            adev->gfx.mec2_fw_version);
118
119         drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0,
120                    adev->gfx.imu_fw_version);
121         drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n",
122                    adev->psp.sos.feature_version, adev->psp.sos.fw_version);
123         drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n",
124                    adev->psp.asd_context.bin_desc.feature_version,
125                    adev->psp.asd_context.bin_desc.fw_version);
126
127         drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
128                    adev->psp.xgmi_context.context.bin_desc.feature_version,
129                    adev->psp.xgmi_context.context.bin_desc.fw_version);
130         drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
131                    adev->psp.ras_context.context.bin_desc.feature_version,
132                    adev->psp.ras_context.context.bin_desc.fw_version);
133         drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
134                    adev->psp.hdcp_context.context.bin_desc.feature_version,
135                    adev->psp.hdcp_context.context.bin_desc.fw_version);
136         drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
137                    adev->psp.dtm_context.context.bin_desc.feature_version,
138                    adev->psp.dtm_context.context.bin_desc.fw_version);
139         drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
140                    adev->psp.rap_context.context.bin_desc.feature_version,
141                    adev->psp.rap_context.context.bin_desc.fw_version);
142         drm_printf(p,
143                    "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
144                    adev->psp.securedisplay_context.context.bin_desc.feature_version,
145                    adev->psp.securedisplay_context.context.bin_desc.fw_version);
146
147         /* SMC firmware */
148         version = adev->pm.fw_version;
149
150         smu_program = (version >> 24) & 0xff;
151         smu_major = (version >> 16) & 0xff;
152         smu_minor = (version >> 8) & 0xff;
153         smu_debug = (version >> 0) & 0xff;
154         drm_printf(p,
155                    "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
156                    0, smu_program, version, smu_major, smu_minor, smu_debug);
157
158         /* SDMA firmware */
159         for (int i = 0; i < adev->sdma.num_instances; i++) {
160                 drm_printf(p,
161                            "SDMA%d feature version: %u, firmware version: 0x%08x\n",
162                            i, adev->sdma.instance[i].feature_version,
163                            adev->sdma.instance[i].fw_version);
164         }
165
166         drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0,
167                    adev->vcn.fw_version);
168         drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0,
169                    adev->dm.dmcu_fw_version);
170         drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
171                    adev->dm.dmcub_fw_version);
172         drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n",
173                    adev->psp.toc.feature_version, adev->psp.toc.fw_version);
174
175         version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
176         feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
177                   AMDGPU_MES_FEAT_VERSION_SHIFT;
178         drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n",
179                    feature, version);
180
181         version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
182         feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
183                   AMDGPU_MES_FEAT_VERSION_SHIFT;
184         drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature,
185                    version);
186
187         drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n",
188                    adev->vpe.feature_version, adev->vpe.fw_version);
189
190         drm_printf(p, "\nVBIOS Information\n");
191         drm_printf(p, "vbios name       : %s\n", ctx->name);
192         drm_printf(p, "vbios pn         : %s\n", ctx->vbios_pn);
193         drm_printf(p, "vbios version    : %d\n", ctx->version);
194         drm_printf(p, "vbios ver_str    : %s\n", ctx->vbios_ver_str);
195         drm_printf(p, "vbios date       : %s\n", ctx->date);
196 }
197
198 static ssize_t
199 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
200                         void *data, size_t datalen)
201 {
202         struct drm_printer p;
203         struct amdgpu_coredump_info *coredump = data;
204         struct drm_print_iterator iter;
205         struct amdgpu_vm_fault_info *fault_info;
206         int i, ver;
207
208         iter.data = buffer;
209         iter.offset = 0;
210         iter.start = offset;
211         iter.remain = count;
212
213         p = drm_coredump_printer(&iter);
214
215         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
216         drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
217         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
218         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
219         drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
220                    coredump->reset_time.tv_nsec);
221
222         if (coredump->reset_task_info.pid)
223                 drm_printf(&p, "process_name: %s PID: %d\n",
224                            coredump->reset_task_info.process_name,
225                            coredump->reset_task_info.pid);
226
227         /* GPU IP's information of the SOC */
228         drm_printf(&p, "\nIP Information\n");
229         drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
230         drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
231         drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);
232
233         for (int i = 1; i < MAX_HWIP; i++) {
234                 for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
235                         ver = coredump->adev->ip_versions[i][j];
236                         if (ver)
237                                 drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
238                                            hw_ip_names[i], i, j,
239                                            IP_VERSION_MAJ(ver),
240                                            IP_VERSION_MIN(ver),
241                                            IP_VERSION_REV(ver),
242                                            IP_VERSION_VARIANT(ver),
243                                            IP_VERSION_SUBREV(ver));
244                 }
245         }
246
247         /* IP firmware information */
248         drm_printf(&p, "\nIP Firmwares\n");
249         amdgpu_devcoredump_fw_info(coredump->adev, &p);
250
251         if (coredump->ring) {
252                 drm_printf(&p, "\nRing timed out details\n");
253                 drm_printf(&p, "IP Type: %d Ring Name: %s\n",
254                            coredump->ring->funcs->type,
255                            coredump->ring->name);
256         }
257
258         /* Add page fault information */
259         fault_info = &coredump->adev->vm_manager.fault_info;
260         drm_printf(&p, "\n[%s] Page fault observed\n",
261                    fault_info->vmhub ? "mmhub" : "gfxhub");
262         drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
263         drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
264
265         /* dump the ip state for each ip */
266         drm_printf(&p, "IP Dump\n");
267         for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
268                 if (coredump->adev->ip_blocks[i].version->funcs->print_ip_state) {
269                         drm_printf(&p, "IP: %s\n",
270                                    coredump->adev->ip_blocks[i]
271                                            .version->funcs->name);
272                         coredump->adev->ip_blocks[i]
273                                 .version->funcs->print_ip_state(
274                                         (void *)coredump->adev, &p);
275                         drm_printf(&p, "\n");
276                 }
277         }
278
279         /* Add ring buffer information */
280         drm_printf(&p, "Ring buffer information\n");
281         for (int i = 0; i < coredump->adev->num_rings; i++) {
282                 int j = 0;
283                 struct amdgpu_ring *ring = coredump->adev->rings[i];
284
285                 drm_printf(&p, "ring name: %s\n", ring->name);
286                 drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
287                            amdgpu_ring_get_rptr(ring),
288                            amdgpu_ring_get_wptr(ring),
289                            ring->buf_mask);
290                 drm_printf(&p, "Ring size in dwords: %d\n",
291                            ring->ring_size / 4);
292                 drm_printf(&p, "Ring contents\n");
293                 drm_printf(&p, "Offset \t Value\n");
294
295                 while (j < ring->ring_size) {
296                         drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j / 4]);
297                         j += 4;
298                 }
299         }
300
301         if (coredump->reset_vram_lost)
302                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
303         if (coredump->adev->reset_info.num_regs) {
304                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
305
306                 for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
307                         drm_printf(&p, "0x%08x: 0x%08x\n",
308                                    coredump->adev->reset_info.reset_dump_reg_list[i],
309                                    coredump->adev->reset_info.reset_dump_reg_value[i]);
310         }
311
312         return count - iter.remain;
313 }
314
/*
 * amdgpu_devcoredump_free - devcoredump free callback
 * @data: the coredump info pointer that was registered with dev_coredumpm()
 *
 * Releases the kzalloc()ed struct amdgpu_coredump_info.
 */
static void amdgpu_devcoredump_free(void *data)
{
        struct amdgpu_coredump_info *coredump = data;

        kfree(coredump);
}
319
320 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
321                      struct amdgpu_reset_context *reset_context)
322 {
323         struct amdgpu_coredump_info *coredump;
324         struct drm_device *dev = adev_to_drm(adev);
325         struct amdgpu_job *job = reset_context->job;
326         struct drm_sched_job *s_job;
327
328         coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
329
330         if (!coredump) {
331                 DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
332                 return;
333         }
334
335         coredump->reset_vram_lost = vram_lost;
336
337         if (reset_context->job && reset_context->job->vm) {
338                 struct amdgpu_task_info *ti;
339                 struct amdgpu_vm *vm = reset_context->job->vm;
340
341                 ti = amdgpu_vm_get_task_info_vm(vm);
342                 if (ti) {
343                         coredump->reset_task_info = *ti;
344                         amdgpu_vm_put_task_info(ti);
345                 }
346         }
347
348         if (job) {
349                 s_job = &job->base;
350                 coredump->ring = to_amdgpu_ring(s_job->sched);
351         }
352
353         coredump->adev = adev;
354
355         ktime_get_ts64(&coredump->reset_time);
356
357         dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
358                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
359 }
360 #endif
This page took 0.056593 seconds and 4 git commands to generate.