Git Repo - linux.git/blob - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83
84 #define AMDGPU_RESUME_MS                2000
85
86 const char *amdgpu_asic_name[] = {
87         "TAHITI",
88         "PITCAIRN",
89         "VERDE",
90         "OLAND",
91         "HAINAN",
92         "BONAIRE",
93         "KAVERI",
94         "KABINI",
95         "HAWAII",
96         "MULLINS",
97         "TOPAZ",
98         "TONGA",
99         "FIJI",
100         "CARRIZO",
101         "STONEY",
102         "POLARIS10",
103         "POLARIS11",
104         "POLARIS12",
105         "VEGAM",
106         "VEGA10",
107         "VEGA12",
108         "VEGA20",
109         "RAVEN",
110         "ARCTURUS",
111         "RENOIR",
112         "NAVI10",
113         "NAVI14",
114         "NAVI12",
115         "SIENNA_CICHLID",
116         "NAVY_FLOUNDER",
117         "LAST",
118 };
119
120 /**
121  * DOC: pcie_replay_count
122  *
123  * The amdgpu driver provides a sysfs API for reporting the total number
124  * of PCIe replays (NAKs).
125  * The file pcie_replay_count is used for this and returns the total
126  * number of replays as a sum of the NAKs generated and NAKs received.
127  */
128
129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130                 struct device_attribute *attr, char *buf)
131 {
132         struct drm_device *ddev = dev_get_drvdata(dev);
133         struct amdgpu_device *adev = drm_to_adev(ddev);
134         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135
136         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140                 amdgpu_device_get_pcie_replay_count, NULL);
141
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143
144 /**
145  * DOC: product_name
146  *
147  * The amdgpu driver provides a sysfs API for reporting the product name
148  * for the device.
149  * The file product_name is used for this and returns the product name
150  * as returned from the FRU.
151  * NOTE: This is only available for certain server cards
152  */
153
154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155                 struct device_attribute *attr, char *buf)
156 {
157         struct drm_device *ddev = dev_get_drvdata(dev);
158         struct amdgpu_device *adev = drm_to_adev(ddev);
159
160         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162
163 static DEVICE_ATTR(product_name, S_IRUGO,
164                 amdgpu_device_get_product_name, NULL);
165
166 /**
167  * DOC: product_number
168  *
169  * The amdgpu driver provides a sysfs API for reporting the part number
170  * for the device.
171  * The file product_number is used for this and returns the part number
172  * as returned from the FRU.
173  * NOTE: This is only available for certain server cards
174  */
175
176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177                 struct device_attribute *attr, char *buf)
178 {
179         struct drm_device *ddev = dev_get_drvdata(dev);
180         struct amdgpu_device *adev = drm_to_adev(ddev);
181
182         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184
185 static DEVICE_ATTR(product_number, S_IRUGO,
186                 amdgpu_device_get_product_number, NULL);
187
188 /**
189  * DOC: serial_number
190  *
191  * The amdgpu driver provides a sysfs API for reporting the serial number
192  * for the device.
193  * The file serial_number is used for this and returns the serial number
194  * as returned from the FRU.
195  * NOTE: This is only available for certain server cards
196  */
197
198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199                 struct device_attribute *attr, char *buf)
200 {
201         struct drm_device *ddev = dev_get_drvdata(dev);
202         struct amdgpu_device *adev = drm_to_adev(ddev);
203
204         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208                 amdgpu_device_get_serial_number, NULL);
209
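/*
 * Example (illustrative only, not part of the driver): the attributes
 * above are ordinary sysfs files, so once the device is bound they can
 * be read from user space.  The card index below is an assumption and
 * depends on the system:
 *
 *	$ cat /sys/class/drm/card0/device/pcie_replay_count
 *	$ cat /sys/class/drm/card0/device/product_name
 *	$ cat /sys/class/drm/card0/device/serial_number
 */
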
210 /**
211  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212  *
213  * @dev: drm_device pointer
214  *
215  * Returns true if the device is a dGPU with HG/PX power control,
216  * otherwise return false.
217  */
218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220         struct amdgpu_device *adev = drm_to_adev(dev);
221
222         if (adev->flags & AMD_IS_PX)
223                 return true;
224         return false;
225 }
226
227 /**
228  * amdgpu_device_supports_baco - Does the device support BACO
229  *
230  * @dev: drm_device pointer
231  *
232  * Returns true if the device supports BACO,
233  * otherwise return false.
234  */
235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237         struct amdgpu_device *adev = drm_to_adev(dev);
238
239         return amdgpu_asic_supports_baco(adev);
240 }
241
242 /*
243  * VRAM access helper functions
244  */
245
246 /**
247  * amdgpu_device_vram_access - read/write a buffer in vram
248  *
249  * @adev: amdgpu_device pointer
250  * @pos: offset of the buffer in vram
251  * @buf: virtual address of the buffer in system memory
252  * @size: read/write size in bytes, sizeof(@buf) must be >= @size
253  * @write: true - write to vram, otherwise - read from vram
254  */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256                                uint32_t *buf, size_t size, bool write)
257 {
258         unsigned long flags;
259         uint32_t hi = ~0;
260         uint64_t last;
261
262
263 #ifdef CONFIG_64BIT
264         last = min(pos + size, adev->gmc.visible_vram_size);
265         if (last > pos) {
266                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267                 size_t count = last - pos;
268
269                 if (write) {
270                         memcpy_toio(addr, buf, count);
271                         mb();
272                         amdgpu_asic_flush_hdp(adev, NULL);
273                 } else {
274                         amdgpu_asic_invalidate_hdp(adev, NULL);
275                         mb();
276                         memcpy_fromio(buf, addr, count);
277                 }
278
279                 if (count == size)
280                         return;
281
282                 pos += count;
283                 buf += count / 4;
284                 size -= count;
285         }
286 #endif
287
288         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289         for (last = pos + size; pos < last; pos += 4) {
290                 uint32_t tmp = pos >> 31;
291
292                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293                 if (tmp != hi) {
294                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295                         hi = tmp;
296                 }
297                 if (write)
298                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
299                 else
300                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
301         }
302         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
304
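/*
 * Example (illustrative only): a sketch of reading the first 256 bytes
 * of VRAM into a local buffer with the helper above; the offset and
 * buffer size are arbitrary values chosen for the example:
 *
 *	uint32_t data[64];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */
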
305 /*
306  * register access helper functions.
307  */
308 /**
309  * amdgpu_device_rreg - read a memory mapped IO or indirect register
310  *
311  * @adev: amdgpu_device pointer
312  * @reg: dword aligned register offset
313  * @acc_flags: access flags which require special behavior
314  *
315  * Returns the 32 bit value from the offset specified.
316  */
317 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
318                             uint32_t reg, uint32_t acc_flags)
319 {
320         uint32_t ret;
321
322         if (adev->in_pci_err_recovery)
323                 return 0;
324
325         if ((reg * 4) < adev->rmmio_size) {
326                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
327                     amdgpu_sriov_runtime(adev) &&
328                     down_read_trylock(&adev->reset_sem)) {
329                         ret = amdgpu_kiq_rreg(adev, reg);
330                         up_read(&adev->reset_sem);
331                 } else {
332                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
333                 }
334         } else {
335                 ret = adev->pcie_rreg(adev, reg * 4);
336         }
337
338         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
339
340         return ret;
341 }
342
343 /*
344  * MMIO register read with byte offset helper function
345  * @offset: byte offset from MMIO start
346  *
347 */
348
349 /**
350  * amdgpu_mm_rreg8 - read a memory mapped IO register
351  *
352  * @adev: amdgpu_device pointer
353  * @offset: byte aligned register offset
354  *
355  * Returns the 8 bit value from the offset specified.
356  */
357 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
358 {
359         if (adev->in_pci_err_recovery)
360                 return 0;
361
362         if (offset < adev->rmmio_size)
363                 return (readb(adev->rmmio + offset));
364         BUG();
365 }
366
367 /*
368  * MMIO register write with byte offset helper function
369  * @offset: byte offset from MMIO start
370  * @value: the value to be written to the register
371  *
372 */
373 /**
374  * amdgpu_mm_wreg8 - write a memory mapped IO register
375  *
376  * @adev: amdgpu_device pointer
377  * @offset: byte aligned register offset
378  * @value: 8 bit value to write
379  *
380  * Writes the value specified to the offset specified.
381  */
382 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
383 {
384         if (adev->in_pci_err_recovery)
385                 return;
386
387         if (offset < adev->rmmio_size)
388                 writeb(value, adev->rmmio + offset);
389         else
390                 BUG();
391 }
392
393 /**
394  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
395  *
396  * @adev: amdgpu_device pointer
397  * @reg: dword aligned register offset
398  * @v: 32 bit value to write to the register
399  * @acc_flags: access flags which require special behavior
400  *
401  * Writes the value specified to the offset specified.
402  */
403 void amdgpu_device_wreg(struct amdgpu_device *adev,
404                         uint32_t reg, uint32_t v,
405                         uint32_t acc_flags)
406 {
407         if (adev->in_pci_err_recovery)
408                 return;
409
410         if ((reg * 4) < adev->rmmio_size) {
411                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
412                     amdgpu_sriov_runtime(adev) &&
413                     down_read_trylock(&adev->reset_sem)) {
414                         amdgpu_kiq_wreg(adev, reg, v);
415                         up_read(&adev->reset_sem);
416                 } else {
417                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
418                 }
419         } else {
420                 adev->pcie_wreg(adev, reg * 4, v);
421         }
422
423         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
424 }
425
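/*
 * Note: most driver code does not call amdgpu_device_rreg()/
 * amdgpu_device_wreg() directly; register accesses normally go through
 * wrapper macros such as RREG32()/WREG32() and
 * RREG32_NO_KIQ()/WREG32_NO_KIQ() (see amdgpu.h), which expand to these
 * helpers with the appropriate acc_flags.
 */
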
426 /*
427  * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
428  *
429  * This function is invoked only for debugfs register access.
430  */
431 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
432                              uint32_t reg, uint32_t v)
433 {
434         if (adev->in_pci_err_recovery)
435                 return;
436
437         if (amdgpu_sriov_fullaccess(adev) &&
438             adev->gfx.rlc.funcs &&
439             adev->gfx.rlc.funcs->is_rlcg_access_range) {
440                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
441                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
442         } else {
443                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
444         }
445 }
446
447 /**
448  * amdgpu_io_rreg - read an IO register
449  *
450  * @adev: amdgpu_device pointer
451  * @reg: dword aligned register offset
452  *
453  * Returns the 32 bit value from the offset specified.
454  */
455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
456 {
457         if (adev->in_pci_err_recovery)
458                 return 0;
459
460         if ((reg * 4) < adev->rio_mem_size)
461                 return ioread32(adev->rio_mem + (reg * 4));
462         else {
463                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
464                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
465         }
466 }
467
468 /**
469  * amdgpu_io_wreg - write to an IO register
470  *
471  * @adev: amdgpu_device pointer
472  * @reg: dword aligned register offset
473  * @v: 32 bit value to write to the register
474  *
475  * Writes the value specified to the offset specified.
476  */
477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
478 {
479         if (adev->in_pci_err_recovery)
480                 return;
481
482         if ((reg * 4) < adev->rio_mem_size)
483                 iowrite32(v, adev->rio_mem + (reg * 4));
484         else {
485                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
486                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
487         }
488 }
489
490 /**
491  * amdgpu_mm_rdoorbell - read a doorbell dword
492  *
493  * @adev: amdgpu_device pointer
494  * @index: doorbell index
495  *
496  * Returns the value in the doorbell aperture at the
497  * requested doorbell index (CIK).
498  */
499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
500 {
501         if (adev->in_pci_err_recovery)
502                 return 0;
503
504         if (index < adev->doorbell.num_doorbells) {
505                 return readl(adev->doorbell.ptr + index);
506         } else {
507                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
508                 return 0;
509         }
510 }
511
512 /**
513  * amdgpu_mm_wdoorbell - write a doorbell dword
514  *
515  * @adev: amdgpu_device pointer
516  * @index: doorbell index
517  * @v: value to write
518  *
519  * Writes @v to the doorbell aperture at the
520  * requested doorbell index (CIK).
521  */
522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
523 {
524         if (adev->in_pci_err_recovery)
525                 return;
526
527         if (index < adev->doorbell.num_doorbells) {
528                 writel(v, adev->doorbell.ptr + index);
529         } else {
530                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
531         }
532 }
533
534 /**
535  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
536  *
537  * @adev: amdgpu_device pointer
538  * @index: doorbell index
539  *
540  * Returns the value in the doorbell aperture at the
541  * requested doorbell index (VEGA10+).
542  */
543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
544 {
545         if (adev->in_pci_err_recovery)
546                 return 0;
547
548         if (index < adev->doorbell.num_doorbells) {
549                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
550         } else {
551                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
552                 return 0;
553         }
554 }
555
556 /**
557  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
558  *
559  * @adev: amdgpu_device pointer
560  * @index: doorbell index
561  * @v: value to write
562  *
563  * Writes @v to the doorbell aperture at the
564  * requested doorbell index (VEGA10+).
565  */
566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
567 {
568         if (adev->in_pci_err_recovery)
569                 return;
570
571         if (index < adev->doorbell.num_doorbells) {
572                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
573         } else {
574                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
575         }
576 }
577
578 /**
579  * amdgpu_device_indirect_rreg - read an indirect register
580  *
581  * @adev: amdgpu_device pointer
582  * @pcie_index: mmio register offset
583  * @pcie_data: mmio register offset
584  *
585  * Returns the value of indirect register @reg_addr
586  */
587 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
588                                 u32 pcie_index, u32 pcie_data,
589                                 u32 reg_addr)
590 {
591         unsigned long flags;
592         u32 r;
593         void __iomem *pcie_index_offset;
594         void __iomem *pcie_data_offset;
595
596         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
597         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
598         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
599
600         writel(reg_addr, pcie_index_offset);
601         readl(pcie_index_offset);
602         r = readl(pcie_data_offset);
603         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
604
605         return r;
606 }
607
608 /**
609  * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
610  *
611  * @adev: amdgpu_device pointer
612  * @pcie_index: mmio register offset
613  * @pcie_data: mmio register offset
614  *
615  * Returns the value of indirect register @reg_addr
616  */
617 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
618                                   u32 pcie_index, u32 pcie_data,
619                                   u32 reg_addr)
620 {
621         unsigned long flags;
622         u64 r;
623         void __iomem *pcie_index_offset;
624         void __iomem *pcie_data_offset;
625
626         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
627         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
628         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
629
630         /* read low 32 bits */
631         writel(reg_addr, pcie_index_offset);
632         readl(pcie_index_offset);
633         r = readl(pcie_data_offset);
634         /* read high 32 bits */
635         writel(reg_addr + 4, pcie_index_offset);
636         readl(pcie_index_offset);
637         r |= ((u64)readl(pcie_data_offset) << 32);
638         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
639
640         return r;
641 }
642
643 /**
644  * amdgpu_device_indirect_wreg - write to an indirect register
645  *
646  * @adev: amdgpu_device pointer
647  * @pcie_index: mmio register offset
648  * @pcie_data: mmio register offset
649  * @reg_addr: indirect register offset
650  * @reg_data: indirect register data
651  *
652  */
653 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
654                                  u32 pcie_index, u32 pcie_data,
655                                  u32 reg_addr, u32 reg_data)
656 {
657         unsigned long flags;
658         void __iomem *pcie_index_offset;
659         void __iomem *pcie_data_offset;
660
661         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
662         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
663         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
664
665         writel(reg_addr, pcie_index_offset);
666         readl(pcie_index_offset);
667         writel(reg_data, pcie_data_offset);
668         readl(pcie_data_offset);
669         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
670 }
671
672 /**
673  * amdgpu_device_indirect_wreg64 - write to a 64 bit indirect register
674  *
675  * @adev: amdgpu_device pointer
676  * @pcie_index: mmio register offset
677  * @pcie_data: mmio register offset
678  * @reg_addr: indirect register offset
679  * @reg_data: indirect register data
680  *
681  */
682 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
683                                    u32 pcie_index, u32 pcie_data,
684                                    u32 reg_addr, u64 reg_data)
685 {
686         unsigned long flags;
687         void __iomem *pcie_index_offset;
688         void __iomem *pcie_data_offset;
689
690         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693
694         /* write low 32 bits */
695         writel(reg_addr, pcie_index_offset);
696         readl(pcie_index_offset);
697         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
698         readl(pcie_data_offset);
699         /* write high 32 bits */
700         writel(reg_addr + 4, pcie_index_offset);
701         readl(pcie_index_offset);
702         writel((u32)(reg_data >> 32), pcie_data_offset);
703         readl(pcie_data_offset);
704         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705 }
706
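/*
 * Example (illustrative only): the helpers above implement the usual
 * index/data register pair protocol.  A hypothetical ASIC backend could
 * route its pcie_rreg callback through them like this, where the
 * example_* names are placeholders and not real driver symbols:
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		u32 pcie_index = example_get_pcie_index_offset(adev);
 *		u32 pcie_data = example_get_pcie_data_offset(adev);
 *
 *		return amdgpu_device_indirect_rreg(adev, pcie_index,
 *						   pcie_data, reg);
 *	}
 */
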
707 /**
708  * amdgpu_invalid_rreg - dummy reg read function
709  *
710  * @adev: amdgpu_device pointer
711  * @reg: offset of register
712  *
713  * Dummy register read function.  Used for register blocks
714  * that certain asics don't have (all asics).
715  * Returns the value in the register.
716  */
717 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
718 {
719         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
720         BUG();
721         return 0;
722 }
723
724 /**
725  * amdgpu_invalid_wreg - dummy reg write function
726  *
727  * @adev: amdgpu_device pointer
728  * @reg: offset of register
729  * @v: value to write to the register
730  *
731  * Dummy register write function.  Used for register blocks
732  * that certain asics don't have (all asics).
733  */
734 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
735 {
736         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
737                   reg, v);
738         BUG();
739 }
740
741 /**
742  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
743  *
744  * @adev: amdgpu_device pointer
745  * @reg: offset of register
746  *
747  * Dummy register read function.  Used for register blocks
748  * that certain asics don't have (all asics).
749  * Returns the value in the register.
750  */
751 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
752 {
753         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
754         BUG();
755         return 0;
756 }
757
758 /**
759  * amdgpu_invalid_wreg64 - dummy reg write function
760  *
761  * @adev: amdgpu_device pointer
762  * @reg: offset of register
763  * @v: value to write to the register
764  *
765  * Dummy register write function.  Used for register blocks
766  * that certain asics don't have (all asics).
767  */
768 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
769 {
770         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
771                   reg, v);
772         BUG();
773 }
774
775 /**
776  * amdgpu_block_invalid_rreg - dummy reg read function
777  *
778  * @adev: amdgpu_device pointer
779  * @block: offset of instance
780  * @reg: offset of register
781  *
782  * Dummy register read function.  Used for register blocks
783  * that certain asics don't have (all asics).
784  * Returns the value in the register.
785  */
786 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
787                                           uint32_t block, uint32_t reg)
788 {
789         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
790                   reg, block);
791         BUG();
792         return 0;
793 }
794
795 /**
796  * amdgpu_block_invalid_wreg - dummy reg write function
797  *
798  * @adev: amdgpu_device pointer
799  * @block: offset of instance
800  * @reg: offset of register
801  * @v: value to write to the register
802  *
803  * Dummy register write function.  Used for register blocks
804  * that certain asics don't have (all asics).
805  */
806 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
807                                       uint32_t block,
808                                       uint32_t reg, uint32_t v)
809 {
810         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
811                   reg, block, v);
812         BUG();
813 }
814
815 /**
816  * amdgpu_device_asic_init - Wrapper for atom asic_init
817  *
818  * @adev: amdgpu_device pointer
819  *
820  * Does any asic specific work and then calls atom asic init.
821  */
822 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
823 {
824         amdgpu_asic_pre_asic_init(adev);
825
826         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
827 }
828
829 /**
830  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
831  *
832  * @adev: amdgpu_device pointer
833  *
834  * Allocates a scratch page of VRAM for use by various things in the
835  * driver.
836  */
837 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
838 {
839         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
840                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
841                                        &adev->vram_scratch.robj,
842                                        &adev->vram_scratch.gpu_addr,
843                                        (void **)&adev->vram_scratch.ptr);
844 }
845
846 /**
847  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
848  *
849  * @adev: amdgpu_device pointer
850  *
851  * Frees the VRAM scratch page.
852  */
853 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
854 {
855         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
856 }
857
858 /**
859  * amdgpu_device_program_register_sequence - program an array of registers.
860  *
861  * @adev: amdgpu_device pointer
862  * @registers: pointer to the register array
863  * @array_size: size of the register array
864  *
865  * Programs an array of registers with AND and OR masks.
866  * This is a helper for setting golden registers.
867  */
868 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
869                                              const u32 *registers,
870                                              const u32 array_size)
871 {
872         u32 tmp, reg, and_mask, or_mask;
873         int i;
874
875         if (array_size % 3)
876                 return;
877
878         for (i = 0; i < array_size; i += 3) {
879                 reg = registers[i + 0];
880                 and_mask = registers[i + 1];
881                 or_mask = registers[i + 2];
882
883                 if (and_mask == 0xffffffff) {
884                         tmp = or_mask;
885                 } else {
886                         tmp = RREG32(reg);
887                         tmp &= ~and_mask;
888                         if (adev->family >= AMDGPU_FAMILY_AI)
889                                 tmp |= (or_mask & and_mask);
890                         else
891                                 tmp |= or_mask;
892                 }
893                 WREG32(reg, tmp);
894         }
895 }
896
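/*
 * Example (illustrative only): the register array is consumed as
 * {reg, and_mask, or_mask} triplets, which is why array_size must be a
 * multiple of 3.  An and_mask of 0xffffffff overwrites the register with
 * or_mask, anything else does a read-modify-write.  A hypothetical
 * golden-settings table (mmEXAMPLE_REG_A/B are placeholder names) could
 * look like:
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000001,
 *		mmEXAMPLE_REG_B, 0x0000ff00, 0x00003400,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */
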
897 /**
898  * amdgpu_device_pci_config_reset - reset the GPU
899  *
900  * @adev: amdgpu_device pointer
901  *
902  * Resets the GPU using the pci config reset sequence.
903  * Only applicable to asics prior to vega10.
904  */
905 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
906 {
907         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
908 }
909
910 /*
911  * GPU doorbell aperture helpers function.
912  */
913 /**
914  * amdgpu_device_doorbell_init - Init doorbell driver information.
915  *
916  * @adev: amdgpu_device pointer
917  *
918  * Init doorbell driver information (CIK)
919  * Returns 0 on success, error on failure.
920  */
921 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
922 {
923
924         /* No doorbell on SI hardware generation */
925         if (adev->asic_type < CHIP_BONAIRE) {
926                 adev->doorbell.base = 0;
927                 adev->doorbell.size = 0;
928                 adev->doorbell.num_doorbells = 0;
929                 adev->doorbell.ptr = NULL;
930                 return 0;
931         }
932
933         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
934                 return -EINVAL;
935
936         amdgpu_asic_init_doorbell_index(adev);
937
938         /* doorbell bar mapping */
939         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
940         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
941
942         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
943                                              adev->doorbell_index.max_assignment+1);
944         if (adev->doorbell.num_doorbells == 0)
945                 return -EINVAL;
946
947         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
948          * paging queue doorbells use the second page. The
949          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
950          * doorbells are in the first page. So with paging queue enabled,
951          * the max num_doorbells needs to be increased by one page (0x400 in dwords)
952          */
953         if (adev->asic_type >= CHIP_VEGA10)
954                 adev->doorbell.num_doorbells += 0x400;
955
956         adev->doorbell.ptr = ioremap(adev->doorbell.base,
957                                      adev->doorbell.num_doorbells *
958                                      sizeof(u32));
959         if (adev->doorbell.ptr == NULL)
960                 return -ENOMEM;
961
962         return 0;
963 }
964
965 /**
966  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
967  *
968  * @adev: amdgpu_device pointer
969  *
970  * Tear down doorbell driver information (CIK)
971  */
972 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
973 {
974         iounmap(adev->doorbell.ptr);
975         adev->doorbell.ptr = NULL;
976 }
977
978
979
980 /*
981  * amdgpu_device_wb_*()
982  * Writeback is the method by which the GPU updates special pages in memory
983  * with the status of certain GPU events (fences, ring pointers, etc.).
984  */
985
986 /**
987  * amdgpu_device_wb_fini - Disable Writeback and free memory
988  *
989  * @adev: amdgpu_device pointer
990  *
991  * Disables Writeback and frees the Writeback memory (all asics).
992  * Used at driver shutdown.
993  */
994 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
995 {
996         if (adev->wb.wb_obj) {
997                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
998                                       &adev->wb.gpu_addr,
999                                       (void **)&adev->wb.wb);
1000                 adev->wb.wb_obj = NULL;
1001         }
1002 }
1003
1004 /**
1005  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1006  *
1007  * @adev: amdgpu_device pointer
1008  *
1009  * Initializes writeback and allocates writeback memory (all asics).
1010  * Used at driver startup.
1011  * Returns 0 on success or a negative error code on failure.
1012  */
1013 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1014 {
1015         int r;
1016
1017         if (adev->wb.wb_obj == NULL) {
1018                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1019                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1020                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1021                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1022                                             (void **)&adev->wb.wb);
1023                 if (r) {
1024                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1025                         return r;
1026                 }
1027
1028                 adev->wb.num_wb = AMDGPU_MAX_WB;
1029                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1030
1031                 /* clear wb memory */
1032                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1033         }
1034
1035         return 0;
1036 }
1037
1038 /**
1039  * amdgpu_device_wb_get - Allocate a wb entry
1040  *
1041  * @adev: amdgpu_device pointer
1042  * @wb: wb index
1043  *
1044  * Allocate a wb slot for use by the driver (all asics).
1045  * Returns 0 on success or -EINVAL on failure.
1046  */
1047 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1048 {
1049         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1050
1051         if (offset < adev->wb.num_wb) {
1052                 __set_bit(offset, adev->wb.used);
1053                 *wb = offset << 3; /* convert to dw offset */
1054                 return 0;
1055         } else {
1056                 return -EINVAL;
1057         }
1058 }
1059
1060 /**
1061  * amdgpu_device_wb_free - Free a wb entry
1062  *
1063  * @adev: amdgpu_device pointer
1064  * @wb: wb index
1065  *
1066  * Free a wb slot allocated for use by the driver (all asics)
1067  */
1068 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1069 {
1070         wb >>= 3;
1071         if (wb < adev->wb.num_wb)
1072                 __clear_bit(wb, adev->wb.used);
1073 }
1074
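/*
 * Example (illustrative only): a typical caller grabs a writeback slot
 * at init time and returns it on teardown:
 *
 *	u32 wb;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &wb);
 *	if (r)
 *		return r;
 *
 * The CPU then accesses the slot via adev->wb.wb[wb] and the GPU via
 * adev->wb.gpu_addr + wb * 4, and the slot is released with:
 *
 *	amdgpu_device_wb_free(adev, wb);
 */
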
1075 /**
1076  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1077  *
1078  * @adev: amdgpu_device pointer
1079  *
1080  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1081  * to fail, but if any of the BARs is not accessible after the resize we abort
1082  * driver loading by returning -ENODEV.
1083  */
1084 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1085 {
1086         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1087         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1088         struct pci_bus *root;
1089         struct resource *res;
1090         unsigned i;
1091         u16 cmd;
1092         int r;
1093
1094         /* Bypass for VF */
1095         if (amdgpu_sriov_vf(adev))
1096                 return 0;
1097
1098         /* skip if the bios has already enabled large BAR */
1099         if (adev->gmc.real_vram_size &&
1100             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1101                 return 0;
1102
1103         /* Check if the root BUS has 64bit memory resources */
1104         root = adev->pdev->bus;
1105         while (root->parent)
1106                 root = root->parent;
1107
1108         pci_bus_for_each_resource(root, res, i) {
1109                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1110                     res->start > 0x100000000ull)
1111                         break;
1112         }
1113
1114         /* Trying to resize is pointless without a root hub window above 4GB */
1115         if (!res)
1116                 return 0;
1117
1118         /* Disable memory decoding while we change the BAR addresses and size */
1119         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1120         pci_write_config_word(adev->pdev, PCI_COMMAND,
1121                               cmd & ~PCI_COMMAND_MEMORY);
1122
1123         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1124         amdgpu_device_doorbell_fini(adev);
1125         if (adev->asic_type >= CHIP_BONAIRE)
1126                 pci_release_resource(adev->pdev, 2);
1127
1128         pci_release_resource(adev->pdev, 0);
1129
1130         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1131         if (r == -ENOSPC)
1132                 DRM_INFO("Not enough PCI address space for a large BAR.");
1133         else if (r && r != -ENOTSUPP)
1134                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1135
1136         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1137
1138         /* When the doorbell or fb BAR isn't available we have no chance of
1139          * using the device.
1140          */
1141         r = amdgpu_device_doorbell_init(adev);
1142         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1143                 return -ENODEV;
1144
1145         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1146
1147         return 0;
1148 }
1149
1150 /*
1151  * GPU helpers function.
1152  */
1153 /**
1154  * amdgpu_device_need_post - check if the hw needs to be posted or not
1155  *
1156  * @adev: amdgpu_device pointer
1157  *
1158  * Check if the asic has been initialized (all asics) at driver startup
1159  * or post is needed if  hw reset is performed.
1160  * Returns true if need or false if not.
1161  */
1162 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1163 {
1164         uint32_t reg;
1165
1166         if (amdgpu_sriov_vf(adev))
1167                 return false;
1168
1169         if (amdgpu_passthrough(adev)) {
1170                 /* for FIJI: In the whole GPU pass-through virtualization case, after VM reboot
1171                  * some old smc fw still needs the driver to do a vPost, otherwise the gpu hangs,
1172                  * while smc fw versions above 22.15 don't have this flaw, so we force
1173                  * a vPost for smc versions below 22.15
1174                  */
1175                 if (adev->asic_type == CHIP_FIJI) {
1176                         int err;
1177                         uint32_t fw_ver;
1178                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1179                         /* force vPost if error occurred */
1180                         if (err)
1181                                 return true;
1182
1183                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1184                         if (fw_ver < 0x00160e00)
1185                                 return true;
1186                 }
1187         }
1188
1189         if (adev->has_hw_reset) {
1190                 adev->has_hw_reset = false;
1191                 return true;
1192         }
1193
1194         /* bios scratch used on CIK+ */
1195         if (adev->asic_type >= CHIP_BONAIRE)
1196                 return amdgpu_atombios_scratch_need_asic_init(adev);
1197
1198         /* check MEM_SIZE for older asics */
1199         reg = amdgpu_asic_get_config_memsize(adev);
1200
1201         if ((reg != 0) && (reg != 0xffffffff))
1202                 return false;
1203
1204         return true;
1205 }
1206
1207 /* if we get transitioned to only one device, take VGA back */
1208 /**
1209  * amdgpu_device_vga_set_decode - enable/disable vga decode
1210  *
1211  * @cookie: amdgpu_device pointer
1212  * @state: enable/disable vga decode
1213  *
1214  * Enable/disable vga decode (all asics).
1215  * Returns VGA resource flags.
1216  */
1217 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1218 {
1219         struct amdgpu_device *adev = cookie;
1220         amdgpu_asic_set_vga_state(adev, state);
1221         if (state)
1222                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1223                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1224         else
1225                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1226 }
1227
1228 /**
1229  * amdgpu_device_check_block_size - validate the vm block size
1230  *
1231  * @adev: amdgpu_device pointer
1232  *
1233  * Validates the vm block size specified via module parameter.
1234  * The vm block size defines number of bits in page table versus page directory,
1235  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1236  * page table and the remaining bits are in the page directory.
1237  */
1238 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1239 {
1240         /* defines number of bits in page table versus page directory,
1241          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1242          * page table and the remaining bits are in the page directory */
1243         if (amdgpu_vm_block_size == -1)
1244                 return;
1245
1246         if (amdgpu_vm_block_size < 9) {
1247                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1248                          amdgpu_vm_block_size);
1249                 amdgpu_vm_block_size = -1;
1250         }
1251 }
1252
1253 /**
1254  * amdgpu_device_check_vm_size - validate the vm size
1255  *
1256  * @adev: amdgpu_device pointer
1257  *
1258  * Validates the vm size in GB specified via module parameter.
1259  * The VM size is the size of the GPU virtual memory space in GB.
1260  */
1261 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1262 {
1263         /* no need to check the default value */
1264         if (amdgpu_vm_size == -1)
1265                 return;
1266
1267         if (amdgpu_vm_size < 1) {
1268                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1269                          amdgpu_vm_size);
1270                 amdgpu_vm_size = -1;
1271         }
1272 }
1273
1274 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1275 {
1276         struct sysinfo si;
1277         bool is_os_64 = (sizeof(void *) == 8);
1278         uint64_t total_memory;
1279         uint64_t dram_size_seven_GB = 0x1B8000000;
1280         uint64_t dram_size_three_GB = 0xB8000000;
1281
1282         if (amdgpu_smu_memory_pool_size == 0)
1283                 return;
1284
1285         if (!is_os_64) {
1286                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1287                 goto def_value;
1288         }
1289         si_meminfo(&si);
1290         total_memory = (uint64_t)si.totalram * si.mem_unit;
1291
1292         if ((amdgpu_smu_memory_pool_size == 1) ||
1293                 (amdgpu_smu_memory_pool_size == 2)) {
1294                 if (total_memory < dram_size_three_GB)
1295                         goto def_value1;
1296         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1297                 (amdgpu_smu_memory_pool_size == 8)) {
1298                 if (total_memory < dram_size_seven_GB)
1299                         goto def_value1;
1300         } else {
1301                 DRM_WARN("Smu memory pool size not supported\n");
1302                 goto def_value;
1303         }
1304         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1305
1306         return;
1307
1308 def_value1:
1309         DRM_WARN("Not enough system memory\n");
1310 def_value:
1311         adev->pm.smu_prv_buffer_size = 0;
1312 }
1313
1314 /**
1315  * amdgpu_device_check_arguments - validate module params
1316  *
1317  * @adev: amdgpu_device pointer
1318  *
1319  * Validates certain module parameters and updates
1320  * the associated values used by the driver (all asics).
1321  */
1322 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1323 {
1324         if (amdgpu_sched_jobs < 4) {
1325                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1326                          amdgpu_sched_jobs);
1327                 amdgpu_sched_jobs = 4;
1328         } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1329                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1330                          amdgpu_sched_jobs);
1331                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1332         }
1333
1334         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1335                 /* gart size must be greater or equal to 32M */
1336                 dev_warn(adev->dev, "gart size (%d) too small\n",
1337                          amdgpu_gart_size);
1338                 amdgpu_gart_size = -1;
1339         }
1340
1341         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1342                 /* gtt size must be greater or equal to 32M */
1343                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1344                                  amdgpu_gtt_size);
1345                 amdgpu_gtt_size = -1;
1346         }
1347
1348         /* valid range is between 4 and 9 inclusive */
1349         if (amdgpu_vm_fragment_size != -1 &&
1350             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1351                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1352                 amdgpu_vm_fragment_size = -1;
1353         }
1354
1355         if (amdgpu_sched_hw_submission < 2) {
1356                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1357                          amdgpu_sched_hw_submission);
1358                 amdgpu_sched_hw_submission = 2;
1359         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1360                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1361                          amdgpu_sched_hw_submission);
1362                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1363         }
1364
1365         amdgpu_device_check_smu_prv_buffer_size(adev);
1366
1367         amdgpu_device_check_vm_size(adev);
1368
1369         amdgpu_device_check_block_size(adev);
1370
1371         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1372
1373         amdgpu_gmc_tmz_set(adev);
1374
1375         if (amdgpu_num_kcq == -1) {
1376                 amdgpu_num_kcq = 8;
1377         } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1378                 amdgpu_num_kcq = 8;
1379                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1380         }
1381
1382         amdgpu_gmc_noretry_set(adev);
1383
1384         return 0;
1385 }
1386
1387 /**
1388  * amdgpu_switcheroo_set_state - set switcheroo state
1389  *
1390  * @pdev: pci dev pointer
1391  * @state: vga_switcheroo state
1392  *
1393  * Callback for the switcheroo driver.  Suspends or resumes
1394  * the asics before or after it is powered up using ACPI methods.
1395  */
1396 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1397                                         enum vga_switcheroo_state state)
1398 {
1399         struct drm_device *dev = pci_get_drvdata(pdev);
1400         int r;
1401
1402         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1403                 return;
1404
1405         if (state == VGA_SWITCHEROO_ON) {
1406                 pr_info("switched on\n");
1407                 /* don't suspend or resume card normally */
1408                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1409
1410                 pci_set_power_state(dev->pdev, PCI_D0);
1411                 amdgpu_device_load_pci_state(dev->pdev);
1412                 r = pci_enable_device(dev->pdev);
1413                 if (r)
1414                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1415                 amdgpu_device_resume(dev, true);
1416
1417                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1418                 drm_kms_helper_poll_enable(dev);
1419         } else {
1420                 pr_info("switched off\n");
1421                 drm_kms_helper_poll_disable(dev);
1422                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1423                 amdgpu_device_suspend(dev, true);
1424                 amdgpu_device_cache_pci_state(dev->pdev);
1425                 /* Shut down the device */
1426                 pci_disable_device(dev->pdev);
1427                 pci_set_power_state(dev->pdev, PCI_D3cold);
1428                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1429         }
1430 }
1431
1432 /**
1433  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1434  *
1435  * @pdev: pci dev pointer
1436  *
1437  * Callback for the switcheroo driver.  Check if the switcheroo
1438  * state can be changed.
1439  * Returns true if the state can be changed, false if not.
1440  */
1441 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1442 {
1443         struct drm_device *dev = pci_get_drvdata(pdev);
1444
1445         /*
1446         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1447         * locking inversion with the driver load path. And the access here is
1448         * completely racy anyway. So don't bother with locking for now.
1449         */
1450         return atomic_read(&dev->open_count) == 0;
1451 }
1452
1453 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1454         .set_gpu_state = amdgpu_switcheroo_set_state,
1455         .reprobe = NULL,
1456         .can_switch = amdgpu_switcheroo_can_switch,
1457 };
1458
1459 /**
1460  * amdgpu_device_ip_set_clockgating_state - set the CG state
1461  *
1462  * @dev: amdgpu_device pointer
1463  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1464  * @state: clockgating state (gate or ungate)
1465  *
1466  * Sets the requested clockgating state for all instances of
1467  * the hardware IP specified.
1468  * Returns the error code from the last instance.
1469  */
1470 int amdgpu_device_ip_set_clockgating_state(void *dev,
1471                                            enum amd_ip_block_type block_type,
1472                                            enum amd_clockgating_state state)
1473 {
1474         struct amdgpu_device *adev = dev;
1475         int i, r = 0;
1476
1477         for (i = 0; i < adev->num_ip_blocks; i++) {
1478                 if (!adev->ip_blocks[i].status.valid)
1479                         continue;
1480                 if (adev->ip_blocks[i].version->type != block_type)
1481                         continue;
1482                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1483                         continue;
1484                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1485                         (void *)adev, state);
1486                 if (r)
1487                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1488                                   adev->ip_blocks[i].version->funcs->name, r);
1489         }
1490         return r;
1491 }
1492
1493 /**
1494  * amdgpu_device_ip_set_powergating_state - set the PG state
1495  *
1496  * @dev: amdgpu_device pointer
1497  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1498  * @state: powergating state (gate or ungate)
1499  *
1500  * Sets the requested powergating state for all instances of
1501  * the hardware IP specified.
1502  * Returns the error code from the last instance.
1503  */
1504 int amdgpu_device_ip_set_powergating_state(void *dev,
1505                                            enum amd_ip_block_type block_type,
1506                                            enum amd_powergating_state state)
1507 {
1508         struct amdgpu_device *adev = dev;
1509         int i, r = 0;
1510
1511         for (i = 0; i < adev->num_ip_blocks; i++) {
1512                 if (!adev->ip_blocks[i].status.valid)
1513                         continue;
1514                 if (adev->ip_blocks[i].version->type != block_type)
1515                         continue;
1516                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1517                         continue;
1518                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1519                         (void *)adev, state);
1520                 if (r)
1521                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1522                                   adev->ip_blocks[i].version->funcs->name, r);
1523         }
1524         return r;
1525 }
1526
1527 /**
1528  * amdgpu_device_ip_get_clockgating_state - get the CG state
1529  *
1530  * @adev: amdgpu_device pointer
1531  * @flags: clockgating feature flags
1532  *
1533  * Walks the list of IPs on the device and updates the clockgating
1534  * flags for each IP.
1535  * Updates @flags with the feature flags for each hardware IP where
1536  * clockgating is enabled.
1537  */
1538 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1539                                             u32 *flags)
1540 {
1541         int i;
1542
1543         for (i = 0; i < adev->num_ip_blocks; i++) {
1544                 if (!adev->ip_blocks[i].status.valid)
1545                         continue;
1546                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1547                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1548         }
1549 }
1550
1551 /**
1552  * amdgpu_device_ip_wait_for_idle - wait for idle
1553  *
1554  * @adev: amdgpu_device pointer
1555  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1556  *
1557  * Waits for the requested hardware IP to be idle.
1558  * Returns 0 for success or a negative error code on failure.
1559  */
1560 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1561                                    enum amd_ip_block_type block_type)
1562 {
1563         int i, r;
1564
1565         for (i = 0; i < adev->num_ip_blocks; i++) {
1566                 if (!adev->ip_blocks[i].status.valid)
1567                         continue;
1568                 if (adev->ip_blocks[i].version->type == block_type) {
1569                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1570                         if (r)
1571                                 return r;
1572                         break;
1573                 }
1574         }
1575         return 0;
1576
1577 }
1578
1579 /**
1580  * amdgpu_device_ip_is_idle - is the hardware IP idle
1581  *
1582  * @adev: amdgpu_device pointer
1583  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1584  *
1585  * Check if the hardware IP is idle or not.
1586  * Returns true if the IP is idle, false if not.
1587  */
1588 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1589                               enum amd_ip_block_type block_type)
1590 {
1591         int i;
1592
1593         for (i = 0; i < adev->num_ip_blocks; i++) {
1594                 if (!adev->ip_blocks[i].status.valid)
1595                         continue;
1596                 if (adev->ip_blocks[i].version->type == block_type)
1597                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1598         }
1599         return true;
1600
1601 }
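
/*
 * Illustrative sketch: a caller could combine the two helpers above, for
 * example polling idleness of the GMC block before waiting on it.  The
 * error handling shown is only an example.
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GMC)) {
 *		int r = amdgpu_device_ip_wait_for_idle(adev,
 *						       AMD_IP_BLOCK_TYPE_GMC);
 *		if (r)
 *			dev_err(adev->dev, "GMC did not go idle: %d\n", r);
 *	}
 */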
1602
1603 /**
1604  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1605  *
1606  * @adev: amdgpu_device pointer
1607  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1608  *
1609  * Returns a pointer to the hardware IP block structure
1610  * if it exists for the asic, otherwise NULL.
1611  */
1612 struct amdgpu_ip_block *
1613 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1614                               enum amd_ip_block_type type)
1615 {
1616         int i;
1617
1618         for (i = 0; i < adev->num_ip_blocks; i++)
1619                 if (adev->ip_blocks[i].version->type == type)
1620                         return &adev->ip_blocks[i];
1621
1622         return NULL;
1623 }
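
/*
 * Illustrative sketch: looking up the GFX IP block and printing its version.
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (ip)
 *		DRM_INFO("GFX IP v%u.%u\n", ip->version->major,
 *			 ip->version->minor);
 */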
1624
1625 /**
1626  * amdgpu_device_ip_block_version_cmp
1627  *
1628  * @adev: amdgpu_device pointer
1629  * @type: enum amd_ip_block_type
1630  * @major: major version
1631  * @minor: minor version
1632  *
1633  * Returns 0 if the IP block's version is equal to or greater than the
1634  * requested one, 1 if it is smaller or the ip_block doesn't exist.
1635  */
1636 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1637                                        enum amd_ip_block_type type,
1638                                        u32 major, u32 minor)
1639 {
1640         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1641
1642         if (ip_block && ((ip_block->version->major > major) ||
1643                         ((ip_block->version->major == major) &&
1644                         (ip_block->version->minor >= minor))))
1645                 return 0;
1646
1647         return 1;
1648 }
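
/*
 * Illustrative sketch: guarding a code path on a minimum GFX IP version.
 * Note the inverted return convention: 0 means "at least this version".
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						9, 0))
 *		DRM_DEBUG("GFX IP is at least v9.0\n");
 */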
1649
1650 /**
1651  * amdgpu_device_ip_block_add
1652  *
1653  * @adev: amdgpu_device pointer
1654  * @ip_block_version: pointer to the IP to add
1655  *
1656  * Adds the IP block driver information to the collection of IPs
1657  * on the asic.
1658  */
1659 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1660                                const struct amdgpu_ip_block_version *ip_block_version)
1661 {
1662         if (!ip_block_version)
1663                 return -EINVAL;
1664
1665         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1666                   ip_block_version->funcs->name);
1667
1668         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1669
1670         return 0;
1671 }
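
/*
 * Illustrative sketch: the per-asic setup code (vi.c, soc15.c, nv.c, etc.)
 * builds the IP list during early init with a series of calls like the
 * following; the exact block names vary per asic.
 *
 *	amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *	amdgpu_device_ip_block_add(adev, &tonga_ih_ip_block);
 */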
1672
1673 /**
1674  * amdgpu_device_enable_virtual_display - enable virtual display feature
1675  *
1676  * @adev: amdgpu_device pointer
1677  *
1678  * Enables the virtual display feature if the user has enabled it via
1679  * the module parameter virtual_display.  This feature provides virtual
1680  * display hardware on headless boards or in virtualized environments.
1681  * This function parses and validates the configuration string specified by
1682  * the user and applies the virtual display configuration (number of
1683  * virtual connectors, crtcs, etc.) specified.
1684  */
1685 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1686 {
1687         adev->enable_virtual_display = false;
1688
1689         if (amdgpu_virtual_display) {
1690                 struct drm_device *ddev = adev_to_drm(adev);
1691                 const char *pci_address_name = pci_name(ddev->pdev);
1692                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1693
1694                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1695                 pciaddstr_tmp = pciaddstr;
1696                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1697                         pciaddname = strsep(&pciaddname_tmp, ",");
1698                         if (!strcmp("all", pciaddname)
1699                             || !strcmp(pci_address_name, pciaddname)) {
1700                                 long num_crtc;
1701                                 int res = -1;
1702
1703                                 adev->enable_virtual_display = true;
1704
1705                                 if (pciaddname_tmp)
1706                                         res = kstrtol(pciaddname_tmp, 10,
1707                                                       &num_crtc);
1708
1709                                 if (!res) {
1710                                         if (num_crtc < 1)
1711                                                 num_crtc = 1;
1712                                         if (num_crtc > 6)
1713                                                 num_crtc = 6;
1714                                         adev->mode_info.num_crtc = num_crtc;
1715                                 } else {
1716                                         adev->mode_info.num_crtc = 1;
1717                                 }
1718                                 break;
1719                         }
1720                 }
1721
1722                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1723                          amdgpu_virtual_display, pci_address_name,
1724                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1725
1726                 kfree(pciaddstr);
1727         }
1728 }
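
/*
 * Illustrative examples of the virtual_display module parameter format
 * parsed above (the PCI addresses are placeholders):
 *
 *	amdgpu.virtual_display=0000:26:00.0,4;0000:27:00.0,2
 *
 * enables virtual display on the two listed devices with 4 and 2 virtual
 * crtcs respectively, while
 *
 *	amdgpu.virtual_display=all,1
 *
 * enables it on every amdgpu device with a single virtual crtc.
 */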
1729
1730 /**
1731  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1732  *
1733  * @adev: amdgpu_device pointer
1734  *
1735  * Parses the asic configuration parameters specified in the gpu info
1736  * firmware and makes them available to the driver for use in configuring
1737  * the asic.
1738  * Returns 0 on success, -EINVAL on failure.
1739  */
1740 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1741 {
1742         const char *chip_name;
1743         char fw_name[40];
1744         int err;
1745         const struct gpu_info_firmware_header_v1_0 *hdr;
1746
1747         adev->firmware.gpu_info_fw = NULL;
1748
1749         if (adev->mman.discovery_bin) {
1750                 amdgpu_discovery_get_gfx_info(adev);
1751
1752                 /*
1753                  * FIXME: The bounding box is still needed by Navi12, so
1754                  * temporarily read it from gpu_info firmware. Should be dropped
1755                  * when DAL no longer needs it.
1756                  */
1757                 if (adev->asic_type != CHIP_NAVI12)
1758                         return 0;
1759         }
1760
1761         switch (adev->asic_type) {
1762 #ifdef CONFIG_DRM_AMDGPU_SI
1763         case CHIP_VERDE:
1764         case CHIP_TAHITI:
1765         case CHIP_PITCAIRN:
1766         case CHIP_OLAND:
1767         case CHIP_HAINAN:
1768 #endif
1769 #ifdef CONFIG_DRM_AMDGPU_CIK
1770         case CHIP_BONAIRE:
1771         case CHIP_HAWAII:
1772         case CHIP_KAVERI:
1773         case CHIP_KABINI:
1774         case CHIP_MULLINS:
1775 #endif
1776         case CHIP_TOPAZ:
1777         case CHIP_TONGA:
1778         case CHIP_FIJI:
1779         case CHIP_POLARIS10:
1780         case CHIP_POLARIS11:
1781         case CHIP_POLARIS12:
1782         case CHIP_VEGAM:
1783         case CHIP_CARRIZO:
1784         case CHIP_STONEY:
1785         case CHIP_VEGA20:
1786         case CHIP_SIENNA_CICHLID:
1787         case CHIP_NAVY_FLOUNDER:
1788         default:
1789                 return 0;
1790         case CHIP_VEGA10:
1791                 chip_name = "vega10";
1792                 break;
1793         case CHIP_VEGA12:
1794                 chip_name = "vega12";
1795                 break;
1796         case CHIP_RAVEN:
1797                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1798                         chip_name = "raven2";
1799                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1800                         chip_name = "picasso";
1801                 else
1802                         chip_name = "raven";
1803                 break;
1804         case CHIP_ARCTURUS:
1805                 chip_name = "arcturus";
1806                 break;
1807         case CHIP_RENOIR:
1808                 chip_name = "renoir";
1809                 break;
1810         case CHIP_NAVI10:
1811                 chip_name = "navi10";
1812                 break;
1813         case CHIP_NAVI14:
1814                 chip_name = "navi14";
1815                 break;
1816         case CHIP_NAVI12:
1817                 chip_name = "navi12";
1818                 break;
1819         }
1820
1821         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1822         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1823         if (err) {
1824                 dev_err(adev->dev,
1825                         "Failed to load gpu_info firmware \"%s\"\n",
1826                         fw_name);
1827                 goto out;
1828         }
1829         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1830         if (err) {
1831                 dev_err(adev->dev,
1832                         "Failed to validate gpu_info firmware \"%s\"\n",
1833                         fw_name);
1834                 goto out;
1835         }
1836
1837         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1838         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1839
1840         switch (hdr->version_major) {
1841         case 1:
1842         {
1843                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1844                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1845                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1846
1847                 /*
1848                  * Should be dropped when DAL no longer needs it.
1849                  */
1850                 if (adev->asic_type == CHIP_NAVI12)
1851                         goto parse_soc_bounding_box;
1852
1853                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1854                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1855                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1856                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1857                 adev->gfx.config.max_texture_channel_caches =
1858                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1859                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1860                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1861                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1862                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1863                 adev->gfx.config.double_offchip_lds_buf =
1864                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1865                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1866                 adev->gfx.cu_info.max_waves_per_simd =
1867                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1868                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1869                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1870                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1871                 if (hdr->version_minor >= 1) {
1872                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1873                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1874                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1875                         adev->gfx.config.num_sc_per_sh =
1876                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1877                         adev->gfx.config.num_packer_per_sc =
1878                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1879                 }
1880
1881 parse_soc_bounding_box:
1882                 /*
1883                  * soc bounding box info is not integrated into the discovery table,
1884                  * so we always need to parse it from the gpu info firmware if needed.
1885                  */
1886                 if (hdr->version_minor == 2) {
1887                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1888                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1889                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1890                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1891                 }
1892                 break;
1893         }
1894         default:
1895                 dev_err(adev->dev,
1896                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1897                 err = -EINVAL;
1898                 goto out;
1899         }
1900 out:
1901         return err;
1902 }
1903
1904 /**
1905  * amdgpu_device_ip_early_init - run early init for hardware IPs
1906  *
1907  * @adev: amdgpu_device pointer
1908  *
1909  * Early initialization pass for hardware IPs.  The hardware IPs that make
1910  * up each asic are discovered and each IP's early_init callback is run.  This
1911  * is the first stage in initializing the asic.
1912  * Returns 0 on success, negative error code on failure.
1913  */
1914 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1915 {
1916         int i, r;
1917
1918         amdgpu_device_enable_virtual_display(adev);
1919
1920         if (amdgpu_sriov_vf(adev)) {
1921                 r = amdgpu_virt_request_full_gpu(adev, true);
1922                 if (r)
1923                         return r;
1924         }
1925
1926         switch (adev->asic_type) {
1927 #ifdef CONFIG_DRM_AMDGPU_SI
1928         case CHIP_VERDE:
1929         case CHIP_TAHITI:
1930         case CHIP_PITCAIRN:
1931         case CHIP_OLAND:
1932         case CHIP_HAINAN:
1933                 adev->family = AMDGPU_FAMILY_SI;
1934                 r = si_set_ip_blocks(adev);
1935                 if (r)
1936                         return r;
1937                 break;
1938 #endif
1939 #ifdef CONFIG_DRM_AMDGPU_CIK
1940         case CHIP_BONAIRE:
1941         case CHIP_HAWAII:
1942         case CHIP_KAVERI:
1943         case CHIP_KABINI:
1944         case CHIP_MULLINS:
1945                 if (adev->flags & AMD_IS_APU)
1946                         adev->family = AMDGPU_FAMILY_KV;
1947                 else
1948                         adev->family = AMDGPU_FAMILY_CI;
1949
1950                 r = cik_set_ip_blocks(adev);
1951                 if (r)
1952                         return r;
1953                 break;
1954 #endif
1955         case CHIP_TOPAZ:
1956         case CHIP_TONGA:
1957         case CHIP_FIJI:
1958         case CHIP_POLARIS10:
1959         case CHIP_POLARIS11:
1960         case CHIP_POLARIS12:
1961         case CHIP_VEGAM:
1962         case CHIP_CARRIZO:
1963         case CHIP_STONEY:
1964                 if (adev->flags & AMD_IS_APU)
1965                         adev->family = AMDGPU_FAMILY_CZ;
1966                 else
1967                         adev->family = AMDGPU_FAMILY_VI;
1968
1969                 r = vi_set_ip_blocks(adev);
1970                 if (r)
1971                         return r;
1972                 break;
1973         case CHIP_VEGA10:
1974         case CHIP_VEGA12:
1975         case CHIP_VEGA20:
1976         case CHIP_RAVEN:
1977         case CHIP_ARCTURUS:
1978         case CHIP_RENOIR:
1979                 if (adev->flags & AMD_IS_APU)
1980                         adev->family = AMDGPU_FAMILY_RV;
1981                 else
1982                         adev->family = AMDGPU_FAMILY_AI;
1983
1984                 r = soc15_set_ip_blocks(adev);
1985                 if (r)
1986                         return r;
1987                 break;
1988         case  CHIP_NAVI10:
1989         case  CHIP_NAVI14:
1990         case  CHIP_NAVI12:
1991         case  CHIP_SIENNA_CICHLID:
1992         case  CHIP_NAVY_FLOUNDER:
1993                 adev->family = AMDGPU_FAMILY_NV;
1994
1995                 r = nv_set_ip_blocks(adev);
1996                 if (r)
1997                         return r;
1998                 break;
1999         default:
2000                 /* FIXME: not supported yet */
2001                 return -EINVAL;
2002         }
2003
2004         amdgpu_amdkfd_device_probe(adev);
2005
2006         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2007         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2008                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2009
2010         for (i = 0; i < adev->num_ip_blocks; i++) {
2011                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2012                         DRM_ERROR("disabled ip block: %d <%s>\n",
2013                                   i, adev->ip_blocks[i].version->funcs->name);
2014                         adev->ip_blocks[i].status.valid = false;
2015                 } else {
2016                         if (adev->ip_blocks[i].version->funcs->early_init) {
2017                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2018                                 if (r == -ENOENT) {
2019                                         adev->ip_blocks[i].status.valid = false;
2020                                 } else if (r) {
2021                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2022                                                   adev->ip_blocks[i].version->funcs->name, r);
2023                                         return r;
2024                                 } else {
2025                                         adev->ip_blocks[i].status.valid = true;
2026                                 }
2027                         } else {
2028                                 adev->ip_blocks[i].status.valid = true;
2029                         }
2030                 }
2031                 /* get the vbios after the asic_funcs are set up */
2032                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2033                         r = amdgpu_device_parse_gpu_info_fw(adev);
2034                         if (r)
2035                                 return r;
2036
2037                         /* Read BIOS */
2038                         if (!amdgpu_get_bios(adev))
2039                                 return -EINVAL;
2040
2041                         r = amdgpu_atombios_init(adev);
2042                         if (r) {
2043                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2044                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2045                                 return r;
2046                         }
2047                 }
2048         }
2049
2050         adev->cg_flags &= amdgpu_cg_mask;
2051         adev->pg_flags &= amdgpu_pg_mask;
2052
2053         return 0;
2054 }
2055
2056 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2057 {
2058         int i, r;
2059
2060         for (i = 0; i < adev->num_ip_blocks; i++) {
2061                 if (!adev->ip_blocks[i].status.sw)
2062                         continue;
2063                 if (adev->ip_blocks[i].status.hw)
2064                         continue;
2065                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2066                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2067                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2068                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2069                         if (r) {
2070                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2071                                           adev->ip_blocks[i].version->funcs->name, r);
2072                                 return r;
2073                         }
2074                         adev->ip_blocks[i].status.hw = true;
2075                 }
2076         }
2077
2078         return 0;
2079 }
2080
2081 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2082 {
2083         int i, r;
2084
2085         for (i = 0; i < adev->num_ip_blocks; i++) {
2086                 if (!adev->ip_blocks[i].status.sw)
2087                         continue;
2088                 if (adev->ip_blocks[i].status.hw)
2089                         continue;
2090                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2091                 if (r) {
2092                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2093                                   adev->ip_blocks[i].version->funcs->name, r);
2094                         return r;
2095                 }
2096                 adev->ip_blocks[i].status.hw = true;
2097         }
2098
2099         return 0;
2100 }
2101
2102 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2103 {
2104         int r = 0;
2105         int i;
2106         uint32_t smu_version;
2107
2108         if (adev->asic_type >= CHIP_VEGA10) {
2109                 for (i = 0; i < adev->num_ip_blocks; i++) {
2110                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2111                                 continue;
2112
2113                         /* no need to do the fw loading again if already done */
2114                         if (adev->ip_blocks[i].status.hw == true)
2115                                 break;
2116
2117                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2118                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2119                                 if (r) {
2120                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2121                                                           adev->ip_blocks[i].version->funcs->name, r);
2122                                         return r;
2123                                 }
2124                         } else {
2125                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2126                                 if (r) {
2127                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2128                                                           adev->ip_blocks[i].version->funcs->name, r);
2129                                         return r;
2130                                 }
2131                         }
2132
2133                         adev->ip_blocks[i].status.hw = true;
2134                         break;
2135                 }
2136         }
2137
2138         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2139                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2140
2141         return r;
2142 }
2143
2144 /**
2145  * amdgpu_device_ip_init - run init for hardware IPs
2146  *
2147  * @adev: amdgpu_device pointer
2148  *
2149  * Main initialization pass for hardware IPs.  The list of all the hardware
2150  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2151  * are run.  sw_init initializes the software state associated with each IP
2152  * and hw_init initializes the hardware associated with each IP.
2153  * Returns 0 on success, negative error code on failure.
2154  */
2155 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2156 {
2157         int i, r;
2158
2159         r = amdgpu_ras_init(adev);
2160         if (r)
2161                 return r;
2162
2163         for (i = 0; i < adev->num_ip_blocks; i++) {
2164                 if (!adev->ip_blocks[i].status.valid)
2165                         continue;
2166                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2167                 if (r) {
2168                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2169                                   adev->ip_blocks[i].version->funcs->name, r);
2170                         goto init_failed;
2171                 }
2172                 adev->ip_blocks[i].status.sw = true;
2173
2174                 /* need to do gmc hw init early so we can allocate gpu mem */
2175                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2176                         r = amdgpu_device_vram_scratch_init(adev);
2177                         if (r) {
2178                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2179                                 goto init_failed;
2180                         }
2181                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2182                         if (r) {
2183                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2184                                 goto init_failed;
2185                         }
2186                         r = amdgpu_device_wb_init(adev);
2187                         if (r) {
2188                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2189                                 goto init_failed;
2190                         }
2191                         adev->ip_blocks[i].status.hw = true;
2192
2193                         /* right after GMC hw init, we create CSA */
2194                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2195                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2196                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2197                                                                 AMDGPU_CSA_SIZE);
2198                                 if (r) {
2199                                         DRM_ERROR("allocate CSA failed %d\n", r);
2200                                         goto init_failed;
2201                                 }
2202                         }
2203                 }
2204         }
2205
2206         if (amdgpu_sriov_vf(adev))
2207                 amdgpu_virt_init_data_exchange(adev);
2208
2209         r = amdgpu_ib_pool_init(adev);
2210         if (r) {
2211                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2212                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2213                 goto init_failed;
2214         }
2215
2216         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2217         if (r)
2218                 goto init_failed;
2219
2220         r = amdgpu_device_ip_hw_init_phase1(adev);
2221         if (r)
2222                 goto init_failed;
2223
2224         r = amdgpu_device_fw_loading(adev);
2225         if (r)
2226                 goto init_failed;
2227
2228         r = amdgpu_device_ip_hw_init_phase2(adev);
2229         if (r)
2230                 goto init_failed;
2231
2232         /*
2233          * retired pages will be loaded from eeprom and reserved here;
2234          * this should be called after amdgpu_device_ip_hw_init_phase2 since
2235          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2236          * functional for I2C communication, which is only true at this point.
2237          *
2238          * amdgpu_ras_recovery_init may fail, but the caller only cares about
2239          * failures caused by a bad gpu state and stops the amdgpu init process
2240          * accordingly. For other failures, it still releases all the
2241          * resources and prints an error message, rather than returning a
2242          * negative value to the upper level.
2243          *
2244          * Note: theoretically, this should be called before all vram allocations
2245          * to protect retired pages from being abused.
2246          */
2247         r = amdgpu_ras_recovery_init(adev);
2248         if (r)
2249                 goto init_failed;
2250
2251         if (adev->gmc.xgmi.num_physical_nodes > 1)
2252                 amdgpu_xgmi_add_device(adev);
2253         amdgpu_amdkfd_device_init(adev);
2254
2255         amdgpu_fru_get_product_info(adev);
2256
2257 init_failed:
2258         if (amdgpu_sriov_vf(adev))
2259                 amdgpu_virt_release_full_gpu(adev, true);
2260
2261         return r;
2262 }
2263
2264 /**
2265  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2266  *
2267  * @adev: amdgpu_device pointer
2268  *
2269  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2270  * this function before a GPU reset.  If the value is retained after a
2271  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2272  */
2273 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2274 {
2275         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2276 }
2277
2278 /**
2279  * amdgpu_device_check_vram_lost - check if vram is valid
2280  *
2281  * @adev: amdgpu_device pointer
2282  *
2283  * Checks the reset magic value written to the gart pointer in VRAM.
2284  * The driver calls this after a GPU reset to see if the contents of
2285  * VRAM have been lost or not.
2286  * Returns true if vram is lost, false if not.
2287  */
2288 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2289 {
2290         if (memcmp(adev->gart.ptr, adev->reset_magic,
2291                         AMDGPU_RESET_MAGIC_NUM))
2292                 return true;
2293
2294         if (!amdgpu_in_reset(adev))
2295                 return false;
2296
2297         /*
2298          * For all ASICs with baco/mode1 reset, the VRAM is
2299          * always assumed to be lost.
2300          */
2301         switch (amdgpu_asic_reset_method(adev)) {
2302         case AMD_RESET_METHOD_BACO:
2303         case AMD_RESET_METHOD_MODE1:
2304                 return true;
2305         default:
2306                 return false;
2307         }
2308 }
2309
2310 /**
2311  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2312  *
2313  * @adev: amdgpu_device pointer
2314  * @state: clockgating state (gate or ungate)
2315  *
2316  * The list of all the hardware IPs that make up the asic is walked and the
2317  * set_clockgating_state callbacks are run.
2318  * During late initialization this is used to enable clockgating; during
2319  * fini or suspend it is used to disable clockgating.
2320  * Returns 0 on success, negative error code on failure.
2321  */
2322
2323 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2324                                                 enum amd_clockgating_state state)
2325 {
2326         int i, j, r;
2327
2328         if (amdgpu_emu_mode == 1)
2329                 return 0;
2330
2331         for (j = 0; j < adev->num_ip_blocks; j++) {
2332                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2333                 if (!adev->ip_blocks[i].status.late_initialized)
2334                         continue;
2335                 /* skip CG for VCE/UVD, it's handled specially */
2336                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2337                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2338                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2339                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2340                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2341                         /* enable clockgating to save power */
2342                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2343                                                                                      state);
2344                         if (r) {
2345                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2346                                           adev->ip_blocks[i].version->funcs->name, r);
2347                                 return r;
2348                         }
2349                 }
2350         }
2351
2352         return 0;
2353 }
2354
2355 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2356 {
2357         int i, j, r;
2358
2359         if (amdgpu_emu_mode == 1)
2360                 return 0;
2361
2362         for (j = 0; j < adev->num_ip_blocks; j++) {
2363                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2364                 if (!adev->ip_blocks[i].status.late_initialized)
2365                         continue;
2366                 /* skip PG for VCE/UVD, it's handled specially */
2367                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2368                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2369                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2370                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2371                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2372                         /* enable powergating to save power */
2373                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2374                                                                                         state);
2375                         if (r) {
2376                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2377                                           adev->ip_blocks[i].version->funcs->name, r);
2378                                 return r;
2379                         }
2380                 }
2381         }
2382         return 0;
2383 }
2384
2385 static int amdgpu_device_enable_mgpu_fan_boost(void)
2386 {
2387         struct amdgpu_gpu_instance *gpu_ins;
2388         struct amdgpu_device *adev;
2389         int i, ret = 0;
2390
2391         mutex_lock(&mgpu_info.mutex);
2392
2393         /*
2394          * MGPU fan boost feature should be enabled
2395          * only when there are two or more dGPUs in
2396          * the system
2397          */
2398         if (mgpu_info.num_dgpu < 2)
2399                 goto out;
2400
2401         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2402                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2403                 adev = gpu_ins->adev;
2404                 if (!(adev->flags & AMD_IS_APU) &&
2405                     !gpu_ins->mgpu_fan_enabled) {
2406                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2407                         if (ret)
2408                                 break;
2409
2410                         gpu_ins->mgpu_fan_enabled = 1;
2411                 }
2412         }
2413
2414 out:
2415         mutex_unlock(&mgpu_info.mutex);
2416
2417         return ret;
2418 }
2419
2420 /**
2421  * amdgpu_device_ip_late_init - run late init for hardware IPs
2422  *
2423  * @adev: amdgpu_device pointer
2424  *
2425  * Late initialization pass for hardware IPs.  The list of all the hardware
2426  * IPs that make up the asic is walked and the late_init callbacks are run.
2427  * late_init covers any special initialization that an IP requires
2428  * after all of the IPs have been initialized or something that needs to happen
2429  * late in the init process.
2430  * Returns 0 on success, negative error code on failure.
2431  */
2432 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2433 {
2434         struct amdgpu_gpu_instance *gpu_instance;
2435         int i = 0, r;
2436
2437         for (i = 0; i < adev->num_ip_blocks; i++) {
2438                 if (!adev->ip_blocks[i].status.hw)
2439                         continue;
2440                 if (adev->ip_blocks[i].version->funcs->late_init) {
2441                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2442                         if (r) {
2443                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2444                                           adev->ip_blocks[i].version->funcs->name, r);
2445                                 return r;
2446                         }
2447                 }
2448                 adev->ip_blocks[i].status.late_initialized = true;
2449         }
2450
2451         amdgpu_ras_set_error_query_ready(adev, true);
2452
2453         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2454         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2455
2456         amdgpu_device_fill_reset_magic(adev);
2457
2458         r = amdgpu_device_enable_mgpu_fan_boost();
2459         if (r)
2460                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2461
2462
2463         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2464                 mutex_lock(&mgpu_info.mutex);
2465
2466                 /*
2467                  * Reset device p-state to low as this was booted with high.
2468                  *
2469                  * This should be performed only after all devices from the same
2470                  * hive get initialized.
2471                  *
2472                  * However, the number of devices in a hive is not known in
2473                  * advance; it is counted one by one as the devices initialize.
2474                  *
2475                  * So we wait for all XGMI interlinked devices to be initialized.
2476                  * This may introduce some delay as those devices may come from
2477                  * different hives, but that should be OK.
2478                  */
2479                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2480                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2481                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2482                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2483                                         continue;
2484
2485                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2486                                                 AMDGPU_XGMI_PSTATE_MIN);
2487                                 if (r) {
2488                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2489                                         break;
2490                                 }
2491                         }
2492                 }
2493
2494                 mutex_unlock(&mgpu_info.mutex);
2495         }
2496
2497         return 0;
2498 }
2499
2500 /**
2501  * amdgpu_device_ip_fini - run fini for hardware IPs
2502  *
2503  * @adev: amdgpu_device pointer
2504  *
2505  * Main teardown pass for hardware IPs.  The list of all the hardware
2506  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2507  * are run.  hw_fini tears down the hardware associated with each IP
2508  * and sw_fini tears down any software state associated with each IP.
2509  * Returns 0 on success, negative error code on failure.
2510  */
2511 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2512 {
2513         int i, r;
2514
2515         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2516                 amdgpu_virt_release_ras_err_handler_data(adev);
2517
2518         amdgpu_ras_pre_fini(adev);
2519
2520         if (adev->gmc.xgmi.num_physical_nodes > 1)
2521                 amdgpu_xgmi_remove_device(adev);
2522
2523         amdgpu_amdkfd_device_fini(adev);
2524
2525         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2526         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2527
2528         /* need to disable SMC first */
2529         for (i = 0; i < adev->num_ip_blocks; i++) {
2530                 if (!adev->ip_blocks[i].status.hw)
2531                         continue;
2532                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2533                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2534                         /* XXX handle errors */
2535                         if (r) {
2536                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2537                                           adev->ip_blocks[i].version->funcs->name, r);
2538                         }
2539                         adev->ip_blocks[i].status.hw = false;
2540                         break;
2541                 }
2542         }
2543
2544         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2545                 if (!adev->ip_blocks[i].status.hw)
2546                         continue;
2547
2548                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2549                 /* XXX handle errors */
2550                 if (r) {
2551                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2552                                   adev->ip_blocks[i].version->funcs->name, r);
2553                 }
2554
2555                 adev->ip_blocks[i].status.hw = false;
2556         }
2557
2558
2559         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2560                 if (!adev->ip_blocks[i].status.sw)
2561                         continue;
2562
2563                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2564                         amdgpu_ucode_free_bo(adev);
2565                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2566                         amdgpu_device_wb_fini(adev);
2567                         amdgpu_device_vram_scratch_fini(adev);
2568                         amdgpu_ib_pool_fini(adev);
2569                 }
2570
2571                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2572                 /* XXX handle errors */
2573                 if (r) {
2574                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2575                                   adev->ip_blocks[i].version->funcs->name, r);
2576                 }
2577                 adev->ip_blocks[i].status.sw = false;
2578                 adev->ip_blocks[i].status.valid = false;
2579         }
2580
2581         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2582                 if (!adev->ip_blocks[i].status.late_initialized)
2583                         continue;
2584                 if (adev->ip_blocks[i].version->funcs->late_fini)
2585                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2586                 adev->ip_blocks[i].status.late_initialized = false;
2587         }
2588
2589         amdgpu_ras_fini(adev);
2590
2591         if (amdgpu_sriov_vf(adev))
2592                 if (amdgpu_virt_release_full_gpu(adev, false))
2593                         DRM_ERROR("failed to release exclusive mode on fini\n");
2594
2595         return 0;
2596 }
2597
2598 /**
2599  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2600  *
2601  * @work: work_struct.
2602  */
2603 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2604 {
2605         struct amdgpu_device *adev =
2606                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2607         int r;
2608
2609         r = amdgpu_ib_ring_tests(adev);
2610         if (r)
2611                 DRM_ERROR("ib ring test failed (%d).\n", r);
2612 }
2613
2614 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2615 {
2616         struct amdgpu_device *adev =
2617                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2618
2619         mutex_lock(&adev->gfx.gfx_off_mutex);
2620         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2621                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2622                         adev->gfx.gfx_off_state = true;
2623         }
2624         mutex_unlock(&adev->gfx.gfx_off_mutex);
2625 }
2626
2627 /**
2628  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2629  *
2630  * @adev: amdgpu_device pointer
2631  *
2632  * Main suspend function for hardware IPs.  The list of all the hardware
2633  * IPs that make up the asic is walked, clockgating is disabled and the
2634  * suspend callbacks are run.  suspend puts the hardware and software state
2635  * in each IP into a state suitable for suspend.
2636  * Returns 0 on success, negative error code on failure.
2637  */
2638 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2639 {
2640         int i, r;
2641
2642         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2643         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2644
2645         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2646                 if (!adev->ip_blocks[i].status.valid)
2647                         continue;
2648
2649                 /* displays are handled separately */
2650                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2651                         continue;
2652
2653                 /* XXX handle errors */
2654                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2655                 /* XXX handle errors */
2656                 if (r) {
2657                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2658                                   adev->ip_blocks[i].version->funcs->name, r);
2659                         return r;
2660                 }
2661
2662                 adev->ip_blocks[i].status.hw = false;
2663         }
2664
2665         return 0;
2666 }
2667
2668 /**
2669  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2670  *
2671  * @adev: amdgpu_device pointer
2672  *
2673  * Main suspend function for hardware IPs.  The list of all the hardware
2674  * IPs that make up the asic is walked, clockgating is disabled and the
2675  * suspend callbacks are run.  suspend puts the hardware and software state
2676  * in each IP into a state suitable for suspend.
2677  * Returns 0 on success, negative error code on failure.
2678  */
2679 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2680 {
2681         int i, r;
2682
2683         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2684                 if (!adev->ip_blocks[i].status.valid)
2685                         continue;
2686                 /* displays are handled in phase1 */
2687                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2688                         continue;
2689                 /* PSP lost connection when err_event_athub occurs */
2690                 if (amdgpu_ras_intr_triggered() &&
2691                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2692                         adev->ip_blocks[i].status.hw = false;
2693                         continue;
2694                 }
2695                 /* XXX handle errors */
2696                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2697                 /* XXX handle errors */
2698                 if (r) {
2699                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2700                                   adev->ip_blocks[i].version->funcs->name, r);
2701                 }
2702                 adev->ip_blocks[i].status.hw = false;
2703                 /* handle putting the SMC in the appropriate state */
2704                 if (!amdgpu_sriov_vf(adev)) {
2705                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2706                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2707                                 if (r) {
2708                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2709                                                         adev->mp1_state, r);
2710                                         return r;
2711                                 }
2712                         }
2713                 }
2714                 adev->ip_blocks[i].status.hw = false;
2715         }
2716
2717         return 0;
2718 }
2719
2720 /**
2721  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2722  *
2723  * @adev: amdgpu_device pointer
2724  *
2725  * Main suspend function for hardware IPs.  The list of all the hardware
2726  * IPs that make up the asic is walked, clockgating is disabled and the
2727  * suspend callbacks are run.  suspend puts the hardware and software state
2728  * in each IP into a state suitable for suspend.
2729  * Returns 0 on success, negative error code on failure.
2730  */
2731 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2732 {
2733         int r;
2734
2735         if (amdgpu_sriov_vf(adev))
2736                 amdgpu_virt_request_full_gpu(adev, false);
2737
2738         r = amdgpu_device_ip_suspend_phase1(adev);
2739         if (r)
2740                 return r;
2741         r = amdgpu_device_ip_suspend_phase2(adev);
2742
2743         if (amdgpu_sriov_vf(adev))
2744                 amdgpu_virt_release_full_gpu(adev, false);
2745
2746         return r;
2747 }
2748
2749 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2750 {
2751         int i, r;
2752
2753         static enum amd_ip_block_type ip_order[] = {
2754                 AMD_IP_BLOCK_TYPE_GMC,
2755                 AMD_IP_BLOCK_TYPE_COMMON,
2756                 AMD_IP_BLOCK_TYPE_PSP,
2757                 AMD_IP_BLOCK_TYPE_IH,
2758         };
2759
2760         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2761                 int j;
2762                 struct amdgpu_ip_block *block;
2763
2764                 block = &adev->ip_blocks[i];
2765                 block->status.hw = false;
2766
2767                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2768
2769                         if (block->version->type != ip_order[j] ||
2770                                 !block->status.valid)
2771                                 continue;
2772
2773                         r = block->version->funcs->hw_init(adev);
2774                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2775                         if (r)
2776                                 return r;
2777                         block->status.hw = true;
2778                 }
2779         }
2780
2781         return 0;
2782 }
2783
2784 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2785 {
2786         int i, r;
2787
2788         static enum amd_ip_block_type ip_order[] = {
2789                 AMD_IP_BLOCK_TYPE_SMC,
2790                 AMD_IP_BLOCK_TYPE_DCE,
2791                 AMD_IP_BLOCK_TYPE_GFX,
2792                 AMD_IP_BLOCK_TYPE_SDMA,
2793                 AMD_IP_BLOCK_TYPE_UVD,
2794                 AMD_IP_BLOCK_TYPE_VCE,
2795                 AMD_IP_BLOCK_TYPE_VCN
2796         };
2797
2798         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2799                 int j;
2800                 struct amdgpu_ip_block *block;
2801
2802                 for (j = 0; j < adev->num_ip_blocks; j++) {
2803                         block = &adev->ip_blocks[j];
2804
2805                         if (block->version->type != ip_order[i] ||
2806                                 !block->status.valid ||
2807                                 block->status.hw)
2808                                 continue;
2809
2810                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2811                                 r = block->version->funcs->resume(adev);
2812                         else
2813                                 r = block->version->funcs->hw_init(adev);
2814
2815                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2816                         if (r)
2817                                 return r;
2818                         block->status.hw = true;
2819                 }
2820         }
2821
2822         return 0;
2823 }
2824
2825 /**
2826  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2827  *
2828  * @adev: amdgpu_device pointer
2829  *
2830  * First resume function for hardware IPs.  The list of all the hardware
2831  * IPs that make up the asic is walked and the resume callbacks are run for
2832  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2833  * after a suspend and updates the software state as necessary.  This
2834  * function is also used for restoring the GPU after a GPU reset.
2835  * Returns 0 on success, negative error code on failure.
2836  */
2837 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2838 {
2839         int i, r;
2840
2841         for (i = 0; i < adev->num_ip_blocks; i++) {
2842                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2843                         continue;
2844                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2845                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2846                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2847
2848                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2849                         if (r) {
2850                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2851                                           adev->ip_blocks[i].version->funcs->name, r);
2852                                 return r;
2853                         }
2854                         adev->ip_blocks[i].status.hw = true;
2855                 }
2856         }
2857
2858         return 0;
2859 }
2860
2861 /**
2862  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2863  *
2864  * @adev: amdgpu_device pointer
2865  *
2866  * Second resume function for hardware IPs.  The list of all the hardware
2867  * IPs that make up the asic is walked and the resume callbacks are run for
2868  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2869  * functional state after a suspend and updates the software state as
2870  * necessary.  This function is also used for restoring the GPU after a GPU
2871  * reset.
2872  * Returns 0 on success, negative error code on failure.
2873  */
2874 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2875 {
2876         int i, r;
2877
2878         for (i = 0; i < adev->num_ip_blocks; i++) {
2879                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2880                         continue;
2881                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2882                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2883                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2884                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2885                         continue;
2886                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2887                 if (r) {
2888                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2889                                   adev->ip_blocks[i].version->funcs->name, r);
2890                         return r;
2891                 }
2892                 adev->ip_blocks[i].status.hw = true;
2893         }
2894
2895         return 0;
2896 }
2897
2898 /**
2899  * amdgpu_device_ip_resume - run resume for hardware IPs
2900  *
2901  * @adev: amdgpu_device pointer
2902  *
2903  * Main resume function for hardware IPs.  The hardware IPs
2904  * are split into two resume functions because they are
2905  * also used in recovering from a GPU reset and some additional
2906  * steps need to be taken between them.  In this case (S3/S4) they are
2907  * run sequentially.
2908  * Returns 0 on success, negative error code on failure.
2909  */
2910 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2911 {
2912         int r;
2913
2914         r = amdgpu_device_ip_resume_phase1(adev);
2915         if (r)
2916                 return r;
2917
2918         r = amdgpu_device_fw_loading(adev);
2919         if (r)
2920                 return r;
2921
2922         r = amdgpu_device_ip_resume_phase2(adev);
2923
2924         return r;
2925 }
2926
2927 /**
2928  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2929  *
2930  * @adev: amdgpu_device pointer
2931  *
2932  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2933  */
2934 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2935 {
2936         if (amdgpu_sriov_vf(adev)) {
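                /*
                 * Newer (atomfirmware) and legacy (atombios) vBIOS images
                 * expose the virtualization capability in different tables,
                 * so query whichever format this board carries.
                 */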
2937                 if (adev->is_atom_fw) {
2938                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2939                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2940                 } else {
2941                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2942                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2943                 }
2944
2945                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2946                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2947         }
2948 }
2949
2950 /**
2951  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2952  *
2953  * @asic_type: AMD asic type
2954  *
2955  * Check if there is DC (new modesetting infrastructure) support for an asic.
2956  * Returns true if DC has support, false if not.
2957  */
2958 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2959 {
2960         switch (asic_type) {
2961 #if defined(CONFIG_DRM_AMD_DC)
2962 #if defined(CONFIG_DRM_AMD_DC_SI)
2963         case CHIP_TAHITI:
2964         case CHIP_PITCAIRN:
2965         case CHIP_VERDE:
2966         case CHIP_OLAND:
2967 #endif
2968         case CHIP_BONAIRE:
2969         case CHIP_KAVERI:
2970         case CHIP_KABINI:
2971         case CHIP_MULLINS:
2972                 /*
2973                  * We have systems in the wild with these ASICs that require
2974                  * LVDS and VGA support which is not supported with DC.
2975                  *
2976                  * Fallback to the non-DC driver here by default so as not to
2977                  * cause regressions.
2978                  */
2979                 return amdgpu_dc > 0;
2980         case CHIP_HAWAII:
2981         case CHIP_CARRIZO:
2982         case CHIP_STONEY:
2983         case CHIP_POLARIS10:
2984         case CHIP_POLARIS11:
2985         case CHIP_POLARIS12:
2986         case CHIP_VEGAM:
2987         case CHIP_TONGA:
2988         case CHIP_FIJI:
2989         case CHIP_VEGA10:
2990         case CHIP_VEGA12:
2991         case CHIP_VEGA20:
2992 #if defined(CONFIG_DRM_AMD_DC_DCN)
2993         case CHIP_RAVEN:
2994         case CHIP_NAVI10:
2995         case CHIP_NAVI14:
2996         case CHIP_NAVI12:
2997         case CHIP_RENOIR:
2998 #endif
2999 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3000         case CHIP_SIENNA_CICHLID:
3001         case CHIP_NAVY_FLOUNDER:
3002 #endif
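                /*
                 * DC is the default for all of the ASICs above;
                 * amdgpu.dc=0 disables it.
                 */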
3003                 return amdgpu_dc != 0;
3004 #endif
3005         default:
3006                 if (amdgpu_dc > 0)
3007                         DRM_INFO("Display Core has been requested via kernel parameter "
3008                                          "but isn't supported by ASIC, ignoring\n");
3009                 return false;
3010         }
3011 }
3012
3013 /**
3014  * amdgpu_device_has_dc_support - check if dc is supported
3015  *
3016  * @adev: amdgpu_device pointer
3017  *
3018  * Returns true for supported, false for not supported
3019  */
3020 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3021 {
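        /* Neither SR-IOV VFs nor virtual display configurations use DC. */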
3022         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3023                 return false;
3024
3025         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3026 }
3027
3028
3029 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3030 {
3031         struct amdgpu_device *adev =
3032                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3033         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3034
3035         /* It's a bug to not have a hive within this function */
3036         if (WARN_ON(!hive))
3037                 return;
3038
3039         /*
3040          * Use task barrier to synchronize all xgmi reset works across the
3041          * hive. task_barrier_enter and task_barrier_exit will block
3042          * until all the threads running the xgmi reset works reach
3043          * those points. task_barrier_full will do both blocks.
3044          */
3045         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3046
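                /*
                 * Enter barrier: every node in the hive begins BACO entry
                 * together; exit barrier: no node leaves BACO until all of
                 * them have finished entering it.
                 */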
3047                 task_barrier_enter(&hive->tb);
3048                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3049
3050                 if (adev->asic_reset_res)
3051                         goto fail;
3052
3053                 task_barrier_exit(&hive->tb);
3054                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3055
3056                 if (adev->asic_reset_res)
3057                         goto fail;
3058
3059                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3060                         adev->mmhub.funcs->reset_ras_error_count(adev);
3061         } else {
3062
3063                 task_barrier_full(&hive->tb);
3064                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3065         }
3066
3067 fail:
3068         if (adev->asic_reset_res)
3069                 DRM_WARN("ASIC reset failed with error %d for drm dev %s",
3070                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3071         amdgpu_put_xgmi_hive(hive);
3072 }
3073
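/*
 * Parse the amdgpu.lockup_timeout module parameter: a comma separated list of
 * up to four timeouts in ms, applied in the order gfx, compute, sdma, video.
 * A value of 0 keeps the default and a negative value means no timeout,
 * e.g. lockup_timeout=10000,60000,10000,10000.
 */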
3074 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3075 {
3076         char *input = amdgpu_lockup_timeout;
3077         char *timeout_setting = NULL;
3078         int index = 0;
3079         long timeout;
3080         int ret = 0;
3081
3082         /*
3083          * By default the timeout for non-compute jobs is 10000 ms
3084          * and there is no timeout enforced on compute jobs.
3085          * In SR-IOV or passthrough mode, the timeout for compute
3086          * jobs is 60000 ms by default.
3087          */
3088         adev->gfx_timeout = msecs_to_jiffies(10000);
3089         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3090         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3091                 adev->compute_timeout =  msecs_to_jiffies(60000);
3092         else
3093                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3094
3095         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3096                 while ((timeout_setting = strsep(&input, ",")) &&
3097                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3098                         ret = kstrtol(timeout_setting, 0, &timeout);
3099                         if (ret)
3100                                 return ret;
3101
3102                         if (timeout == 0) {
3103                                 index++;
3104                                 continue;
3105                         } else if (timeout < 0) {
3106                                 timeout = MAX_SCHEDULE_TIMEOUT;
3107                         } else {
3108                                 timeout = msecs_to_jiffies(timeout);
3109                         }
3110
3111                         switch (index++) {
3112                         case 0:
3113                                 adev->gfx_timeout = timeout;
3114                                 break;
3115                         case 1:
3116                                 adev->compute_timeout = timeout;
3117                                 break;
3118                         case 2:
3119                                 adev->sdma_timeout = timeout;
3120                                 break;
3121                         case 3:
3122                                 adev->video_timeout = timeout;
3123                                 break;
3124                         default:
3125                                 break;
3126                         }
3127                 }
3128                 /*
3129                  * There is only one value specified and
3130                  * it should apply to all non-compute jobs.
3131                  */
3132                 if (index == 1) {
3133                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3134                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3135                                 adev->compute_timeout = adev->gfx_timeout;
3136                 }
3137         }
3138
3139         return ret;
3140 }
3141
3142 static const struct attribute *amdgpu_dev_attributes[] = {
3143         &dev_attr_product_name.attr,
3144         &dev_attr_product_number.attr,
3145         &dev_attr_serial_number.attr,
3146         &dev_attr_pcie_replay_count.attr,
3147         NULL
3148 };
3149
3150
3151 /**
3152  * amdgpu_device_init - initialize the driver
3153  *
3154  * @adev: amdgpu_device pointer
3155  * @flags: driver flags
3156  *
3157  * Initializes the driver info and hw (all asics).
3158  * Returns 0 for success or an error on failure.
3159  * Called at driver startup.
3160  */
3161 int amdgpu_device_init(struct amdgpu_device *adev,
3162                        uint32_t flags)
3163 {
3164         struct drm_device *ddev = adev_to_drm(adev);
3165         struct pci_dev *pdev = adev->pdev;
3166         int r, i;
3167         bool boco = false;
3168         u32 max_MBps;
3169
3170         adev->shutdown = false;
3171         adev->flags = flags;
3172
3173         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3174                 adev->asic_type = amdgpu_force_asic_type;
3175         else
3176                 adev->asic_type = flags & AMD_ASIC_MASK;
3177
3178         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3179         if (amdgpu_emu_mode == 1)
3180                 adev->usec_timeout *= 10;
3181         adev->gmc.gart_size = 512 * 1024 * 1024;
3182         adev->accel_working = false;
3183         adev->num_rings = 0;
3184         adev->mman.buffer_funcs = NULL;
3185         adev->mman.buffer_funcs_ring = NULL;
3186         adev->vm_manager.vm_pte_funcs = NULL;
3187         adev->vm_manager.vm_pte_num_scheds = 0;
3188         adev->gmc.gmc_funcs = NULL;
3189         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3190         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3191
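        /*
         * Point all register accessors at the "invalid" stubs first; the SoC
         * specific code installs the real callbacks during early init.
         */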
3192         adev->smc_rreg = &amdgpu_invalid_rreg;
3193         adev->smc_wreg = &amdgpu_invalid_wreg;
3194         adev->pcie_rreg = &amdgpu_invalid_rreg;
3195         adev->pcie_wreg = &amdgpu_invalid_wreg;
3196         adev->pciep_rreg = &amdgpu_invalid_rreg;
3197         adev->pciep_wreg = &amdgpu_invalid_wreg;
3198         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3199         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3200         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3201         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3202         adev->didt_rreg = &amdgpu_invalid_rreg;
3203         adev->didt_wreg = &amdgpu_invalid_wreg;
3204         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3205         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3206         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3207         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3208
3209         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3210                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3211                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3212
3213         /* mutex initialization is all done here so we
3214          * can use these locks everywhere else without init ordering issues */
3215         atomic_set(&adev->irq.ih.lock, 0);
3216         mutex_init(&adev->firmware.mutex);
3217         mutex_init(&adev->pm.mutex);
3218         mutex_init(&adev->gfx.gpu_clock_mutex);
3219         mutex_init(&adev->srbm_mutex);
3220         mutex_init(&adev->gfx.pipe_reserve_mutex);
3221         mutex_init(&adev->gfx.gfx_off_mutex);
3222         mutex_init(&adev->grbm_idx_mutex);
3223         mutex_init(&adev->mn_lock);
3224         mutex_init(&adev->virt.vf_errors.lock);
3225         hash_init(adev->mn_hash);
3226         atomic_set(&adev->in_gpu_reset, 0);
3227         init_rwsem(&adev->reset_sem);
3228         mutex_init(&adev->psp.mutex);
3229         mutex_init(&adev->notifier_lock);
3230
3231         r = amdgpu_device_check_arguments(adev);
3232         if (r)
3233                 return r;
3234
3235         spin_lock_init(&adev->mmio_idx_lock);
3236         spin_lock_init(&adev->smc_idx_lock);
3237         spin_lock_init(&adev->pcie_idx_lock);
3238         spin_lock_init(&adev->uvd_ctx_idx_lock);
3239         spin_lock_init(&adev->didt_idx_lock);
3240         spin_lock_init(&adev->gc_cac_idx_lock);
3241         spin_lock_init(&adev->se_cac_idx_lock);
3242         spin_lock_init(&adev->audio_endpt_idx_lock);
3243         spin_lock_init(&adev->mm_stats.lock);
3244
3245         INIT_LIST_HEAD(&adev->shadow_list);
3246         mutex_init(&adev->shadow_list_lock);
3247
3248         INIT_DELAYED_WORK(&adev->delayed_init_work,
3249                           amdgpu_device_delayed_init_work_handler);
3250         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3251                           amdgpu_device_delay_enable_gfx_off);
3252
3253         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3254
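        /* GFXOFF stays disallowed while the request count is non-zero. */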
3255         adev->gfx.gfx_off_req_count = 1;
3256         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3257
3258         atomic_set(&adev->throttling_logging_enabled, 1);
3259         /*
3260          * If throttling continues, logging will be performed every minute
3261          * to avoid log flooding. "-1" is subtracted since the thermal
3262          * throttling interrupt comes every second. Thus, the total logging
3263          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3264          * for throttling interrupt) = 60 seconds.
3265          */
3266         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3267         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3268
3269         /* Registers mapping */
3270         /* TODO: block userspace mapping of io register */
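        /* The register BAR is BAR 5 on CIK and newer ASICs, BAR 2 on SI. */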
3271         if (adev->asic_type >= CHIP_BONAIRE) {
3272                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3273                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3274         } else {
3275                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3276                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3277         }
3278
3279         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3280         if (adev->rmmio == NULL) {
3281                 return -ENOMEM;
3282         }
3283         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3284         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3285
3286         /* io port mapping */
3287         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3288                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3289                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3290                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3291                         break;
3292                 }
3293         }
3294         if (adev->rio_mem == NULL)
3295                 DRM_INFO("PCI I/O BAR is not found.\n");
3296
3297         /* enable PCIE atomic ops */
3298         r = pci_enable_atomic_ops_to_root(adev->pdev,
3299                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3300                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3301         if (r) {
3302                 adev->have_atomics_support = false;
3303                 DRM_INFO("PCIe atomic ops are not supported\n");
3304         } else {
3305                 adev->have_atomics_support = true;
3306         }
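        /* have_atomics_support is consumed later, e.g. by KFD. */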
3307
3308         amdgpu_device_get_pcie_info(adev);
3309
3310         if (amdgpu_mcbp)
3311                 DRM_INFO("MCBP is enabled\n");
3312
3313         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3314                 adev->enable_mes = true;
3315
3316         /* detect hw virtualization here */
3317         amdgpu_detect_virtualization(adev);
3318
3319         r = amdgpu_device_get_job_timeout_settings(adev);
3320         if (r) {
3321                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3322                 goto failed_unmap;
3323         }
3324
3325         /* early init functions */
3326         r = amdgpu_device_ip_early_init(adev);
3327         if (r)
3328                 goto failed_unmap;
3329
3330         /* doorbell bar mapping and doorbell index init */
3331         amdgpu_device_doorbell_init(adev);
3332
3333         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3334         /* this will fail for cards that aren't VGA class devices, just
3335          * ignore it */
3336         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3337
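        /*
         * On ATPX platforms register with vga_switcheroo so the dGPU can be
         * powered down when it is not in use.
         */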
3338         if (amdgpu_device_supports_boco(ddev))
3339                 boco = true;
3340         if (amdgpu_has_atpx() &&
3341             (amdgpu_is_atpx_hybrid() ||
3342              amdgpu_has_atpx_dgpu_power_cntl()) &&
3343             !pci_is_thunderbolt_attached(adev->pdev))
3344                 vga_switcheroo_register_client(adev->pdev,
3345                                                &amdgpu_switcheroo_ops, boco);
3346         if (boco)
3347                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3348
3349         if (amdgpu_emu_mode == 1) {
3350                 /* post the asic in emulation mode */
3351                 emu_soc_asic_init(adev);
3352                 goto fence_driver_init;
3353         }
3354
3355         /* detect if we have an SR-IOV vBIOS */
3356         amdgpu_device_detect_sriov_bios(adev);
3357
3358         /* check if we need to reset the asic
3359          *  E.g., driver was not cleanly unloaded previously, etc.
3360          */
3361         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3362                 r = amdgpu_asic_reset(adev);
3363                 if (r) {
3364                         dev_err(adev->dev, "asic reset on init failed\n");
3365                         goto failed;
3366                 }
3367         }
3368
3369         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3370
3371         /* Post card if necessary */
3372         if (amdgpu_device_need_post(adev)) {
3373                 if (!adev->bios) {
3374                         dev_err(adev->dev, "no vBIOS found\n");
3375                         r = -EINVAL;
3376                         goto failed;
3377                 }
3378                 DRM_INFO("GPU posting now...\n");
3379                 r = amdgpu_device_asic_init(adev);
3380                 if (r) {
3381                         dev_err(adev->dev, "gpu post error!\n");
3382                         goto failed;
3383                 }
3384         }
3385
3386         if (adev->is_atom_fw) {
3387                 /* Initialize clocks */
3388                 r = amdgpu_atomfirmware_get_clock_info(adev);
3389                 if (r) {
3390                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3391                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3392                         goto failed;
3393                 }
3394         } else {
3395                 /* Initialize clocks */
3396                 r = amdgpu_atombios_get_clock_info(adev);
3397                 if (r) {
3398                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3399                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3400                         goto failed;
3401                 }
3402                 /* init i2c buses */
3403                 if (!amdgpu_device_has_dc_support(adev))
3404                         amdgpu_atombios_i2c_init(adev);
3405         }
3406
3407 fence_driver_init:
3408         /* Fence driver */
3409         r = amdgpu_fence_driver_init(adev);
3410         if (r) {
3411                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3412                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3413                 goto failed;
3414         }
3415
3416         /* init the mode config */
3417         drm_mode_config_init(adev_to_drm(adev));
3418
3419         r = amdgpu_device_ip_init(adev);
3420         if (r) {
3421                 /* failed in exclusive mode due to timeout */
3422                 if (amdgpu_sriov_vf(adev) &&
3423                     !amdgpu_sriov_runtime(adev) &&
3424                     amdgpu_virt_mmio_blocked(adev) &&
3425                     !amdgpu_virt_wait_reset(adev)) {
3426                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3427                         /* Don't send request since VF is inactive. */
3428                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3429                         adev->virt.ops = NULL;
3430                         r = -EAGAIN;
3431                         goto failed;
3432                 }
3433                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3434                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3435                 goto failed;
3436         }
3437
3438         dev_info(adev->dev,
3439                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3440                         adev->gfx.config.max_shader_engines,
3441                         adev->gfx.config.max_sh_per_se,
3442                         adev->gfx.config.max_cu_per_sh,
3443                         adev->gfx.cu_info.number);
3444
3445         adev->accel_working = true;
3446
3447         amdgpu_vm_check_compute_bug(adev);
3448
3449         /* Initialize the buffer migration limit. */
3450         if (amdgpu_moverate >= 0)
3451                 max_MBps = amdgpu_moverate;
3452         else
3453                 max_MBps = 8; /* Allow 8 MB/s. */
3454         /* Get a log2 for easy divisions. */
3455         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3456
3457         amdgpu_fbdev_init(adev);
3458
3459         r = amdgpu_pm_sysfs_init(adev);
3460         if (r) {
3461                 adev->pm_sysfs_en = false;
3462                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3463         } else
3464                 adev->pm_sysfs_en = true;
3465
3466         r = amdgpu_ucode_sysfs_init(adev);
3467         if (r) {
3468                 adev->ucode_sysfs_en = false;
3469                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3470         } else
3471                 adev->ucode_sysfs_en = true;
3472
3473         if ((amdgpu_testing & 1)) {
3474                 if (adev->accel_working)
3475                         amdgpu_test_moves(adev);
3476                 else
3477                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3478         }
3479         if (amdgpu_benchmarking) {
3480                 if (adev->accel_working)
3481                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3482                 else
3483                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3484         }
3485
3486         /*
3487          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3488          * Otherwise the mgpu fan boost feature will be skipped because the
3489          * gpu instance count would be too low.
3490          */
3491         amdgpu_register_gpu_instance(adev);
3492
3493         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3494          * explicit gating rather than handling it automatically.
3495          */
3496         r = amdgpu_device_ip_late_init(adev);
3497         if (r) {
3498                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3499                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3500                 goto failed;
3501         }
3502
3503         /* must succeed. */
3504         amdgpu_ras_resume(adev);
3505
3506         queue_delayed_work(system_wq, &adev->delayed_init_work,
3507                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3508
3509         if (amdgpu_sriov_vf(adev))
3510                 flush_delayed_work(&adev->delayed_init_work);
3511
3512         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3513         if (r)
3514                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3515
3516         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3517                 r = amdgpu_pmu_init(adev);
3518         if (IS_ENABLED(CONFIG_PERF_EVENTS) && r)
3519                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3520
3521         /* Keep the stored PCI config space at hand to restore it after a sudden PCI error */
3522         if (amdgpu_device_cache_pci_state(adev->pdev))
3523                 pci_restore_state(pdev);
3524
3525         return 0;
3526
3527 failed:
3528         amdgpu_vf_error_trans_all(adev);
3529         if (boco)
3530                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3531
3532 failed_unmap:
3533         iounmap(adev->rmmio);
3534         adev->rmmio = NULL;
3535
3536         return r;
3537 }
3538
3539 /**
3540  * amdgpu_device_fini - tear down the driver
3541  *
3542  * @adev: amdgpu_device pointer
3543  *
3544  * Tear down the driver info (all asics).
3545  * Called at driver shutdown.
3546  */
3547 void amdgpu_device_fini(struct amdgpu_device *adev)
3548 {
3549         dev_info(adev->dev, "amdgpu: finishing device.\n");
3550         flush_delayed_work(&adev->delayed_init_work);
3551         adev->shutdown = true;
3552
3553         kfree(adev->pci_state);
3554
3555         /* make sure IB tests have finished before entering exclusive mode
3556          * to avoid preemption on IB tests
3557          */
3558         if (amdgpu_sriov_vf(adev)) {
3559                 amdgpu_virt_request_full_gpu(adev, false);
3560                 amdgpu_virt_fini_data_exchange(adev);
3561         }
3562
3563         /* disable all interrupts */
3564         amdgpu_irq_disable_all(adev);
3565         if (adev->mode_info.mode_config_initialized) {
3566                 if (!amdgpu_device_has_dc_support(adev))
3567                         drm_helper_force_disable_all(adev_to_drm(adev));
3568                 else
3569                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3570         }
3571         amdgpu_fence_driver_fini(adev);
3572         if (adev->pm_sysfs_en)
3573                 amdgpu_pm_sysfs_fini(adev);
3574         amdgpu_fbdev_fini(adev);
3575         amdgpu_device_ip_fini(adev);
3576         release_firmware(adev->firmware.gpu_info_fw);
3577         adev->firmware.gpu_info_fw = NULL;
3578         adev->accel_working = false;
3579         /* free i2c buses */
3580         if (!amdgpu_device_has_dc_support(adev))
3581                 amdgpu_i2c_fini(adev);
3582
3583         if (amdgpu_emu_mode != 1)
3584                 amdgpu_atombios_fini(adev);
3585
3586         kfree(adev->bios);
3587         adev->bios = NULL;
3588         if (amdgpu_has_atpx() &&
3589             (amdgpu_is_atpx_hybrid() ||
3590              amdgpu_has_atpx_dgpu_power_cntl()) &&
3591             !pci_is_thunderbolt_attached(adev->pdev))
3592                 vga_switcheroo_unregister_client(adev->pdev);
3593         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3594                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3595         vga_client_register(adev->pdev, NULL, NULL, NULL);
3596         if (adev->rio_mem)
3597                 pci_iounmap(adev->pdev, adev->rio_mem);
3598         adev->rio_mem = NULL;
3599         iounmap(adev->rmmio);
3600         adev->rmmio = NULL;
3601         amdgpu_device_doorbell_fini(adev);
3602
3603         if (adev->ucode_sysfs_en)
3604                 amdgpu_ucode_sysfs_fini(adev);
3605
3606         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3607         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3608                 amdgpu_pmu_fini(adev);
3609         if (adev->mman.discovery_bin)
3610                 amdgpu_discovery_fini(adev);
3611 }
3612
3613
3614 /*
3615  * Suspend & resume.
3616  */
3617 /**
3618  * amdgpu_device_suspend - initiate device suspend
3619  *
3620  * @dev: drm dev pointer
3621  * @fbcon : notify the fbdev of suspend
3622  *
3623  * Puts the hw in the suspend state (all asics).
3624  * Returns 0 for success or an error on failure.
3625  * Called at driver suspend.
3626  */
3627 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3628 {
3629         struct amdgpu_device *adev;
3630         struct drm_crtc *crtc;
3631         struct drm_connector *connector;
3632         struct drm_connector_list_iter iter;
3633         int r;
3634
3635         adev = drm_to_adev(dev);
3636
3637         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3638                 return 0;
3639
3640         adev->in_suspend = true;
3641         drm_kms_helper_poll_disable(dev);
3642
3643         if (fbcon)
3644                 amdgpu_fbdev_set_suspend(adev, 1);
3645
3646         cancel_delayed_work_sync(&adev->delayed_init_work);
3647
3648         if (!amdgpu_device_has_dc_support(adev)) {
3649                 /* turn off display hw */
3650                 drm_modeset_lock_all(dev);
3651                 drm_connector_list_iter_begin(dev, &iter);
3652                 drm_for_each_connector_iter(connector, &iter)
3653                         drm_helper_connector_dpms(connector,
3654                                                   DRM_MODE_DPMS_OFF);
3655                 drm_connector_list_iter_end(&iter);
3656                 drm_modeset_unlock_all(dev);
3657                 /* unpin the front buffers and cursors */
3658                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3659                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3660                         struct drm_framebuffer *fb = crtc->primary->fb;
3661                         struct amdgpu_bo *robj;
3662
3663                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3664                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3665                                 r = amdgpu_bo_reserve(aobj, true);
3666                                 if (r == 0) {
3667                                         amdgpu_bo_unpin(aobj);
3668                                         amdgpu_bo_unreserve(aobj);
3669                                 }
3670                         }
3671
3672                         if (fb == NULL || fb->obj[0] == NULL) {
3673                                 continue;
3674                         }
3675                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3676                         /* don't unpin kernel fb objects */
3677                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3678                                 r = amdgpu_bo_reserve(robj, true);
3679                                 if (r == 0) {
3680                                         amdgpu_bo_unpin(robj);
3681                                         amdgpu_bo_unreserve(robj);
3682                                 }
3683                         }
3684                 }
3685         }
3686
3687         amdgpu_ras_suspend(adev);
3688
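        /*
         * Suspend is split in two phases: display hardware first (phase 1),
         * then the remaining IP blocks (phase 2) once VRAM has been evicted
         * and the fence driver is suspended.
         */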
3689         r = amdgpu_device_ip_suspend_phase1(adev);
3690
3691         amdgpu_amdkfd_suspend(adev, !fbcon);
3692
3693         /* evict vram memory */
3694         amdgpu_bo_evict_vram(adev);
3695
3696         amdgpu_fence_driver_suspend(adev);
3697
3698         r = amdgpu_device_ip_suspend_phase2(adev);
3699
3700         /* evict remaining vram memory
3701          * This second call to evict vram is to evict the gart page table
3702          * using the CPU.
3703          */
3704         amdgpu_bo_evict_vram(adev);
3705
3706         return 0;
3707 }
3708
3709 /**
3710  * amdgpu_device_resume - initiate device resume
3711  *
3712  * @dev: drm dev pointer
3713  * @fbcon : notify the fbdev of resume
3714  *
3715  * Bring the hw back to operating state (all asics).
3716  * Returns 0 for success or an error on failure.
3717  * Called at driver resume.
3718  */
3719 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3720 {
3721         struct drm_connector *connector;
3722         struct drm_connector_list_iter iter;
3723         struct amdgpu_device *adev = drm_to_adev(dev);
3724         struct drm_crtc *crtc;
3725         int r = 0;
3726
3727         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3728                 return 0;
3729
3730         /* post card */
3731         if (amdgpu_device_need_post(adev)) {
3732                 r = amdgpu_device_asic_init(adev);
3733                 if (r)
3734                         dev_err(adev->dev, "amdgpu asic init failed\n");
3735         }
3736
3737         r = amdgpu_device_ip_resume(adev);
3738         if (r) {
3739                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3740                 return r;
3741         }
3742         amdgpu_fence_driver_resume(adev);
3743
3744
3745         r = amdgpu_device_ip_late_init(adev);
3746         if (r)
3747                 return r;
3748
3749         queue_delayed_work(system_wq, &adev->delayed_init_work,
3750                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3751
3752         if (!amdgpu_device_has_dc_support(adev)) {
3753                 /* pin cursors */
3754                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3755                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3756
3757                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3758                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3759                                 r = amdgpu_bo_reserve(aobj, true);
3760                                 if (r == 0) {
3761                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3762                                         if (r != 0)
3763                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3764                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3765                                         amdgpu_bo_unreserve(aobj);
3766                                 }
3767                         }
3768                 }
3769         }
3770         r = amdgpu_amdkfd_resume(adev, !fbcon);
3771         if (r)
3772                 return r;
3773
3774         /* Make sure IB tests flushed */
3775         flush_delayed_work(&adev->delayed_init_work);
3776
3777         /* blat the mode back in */
3778         if (fbcon) {
3779                 if (!amdgpu_device_has_dc_support(adev)) {
3780                         /* pre DCE11 */
3781                         drm_helper_resume_force_mode(dev);
3782
3783                         /* turn on display hw */
3784                         drm_modeset_lock_all(dev);
3785
3786                         drm_connector_list_iter_begin(dev, &iter);
3787                         drm_for_each_connector_iter(connector, &iter)
3788                                 drm_helper_connector_dpms(connector,
3789                                                           DRM_MODE_DPMS_ON);
3790                         drm_connector_list_iter_end(&iter);
3791
3792                         drm_modeset_unlock_all(dev);
3793                 }
3794                 amdgpu_fbdev_set_suspend(adev, 0);
3795         }
3796
3797         drm_kms_helper_poll_enable(dev);
3798
3799         amdgpu_ras_resume(adev);
3800
3801         /*
3802          * Most of the connector probing functions try to acquire runtime pm
3803          * refs to ensure that the GPU is powered on when connector polling is
3804          * performed. Since we're calling this from a runtime PM callback,
3805          * trying to acquire rpm refs will cause us to deadlock.
3806          *
3807          * Since we're guaranteed to be holding the rpm lock, it's safe to
3808          * temporarily disable the rpm helpers so this doesn't deadlock us.
3809          */
3810 #ifdef CONFIG_PM
3811         dev->dev->power.disable_depth++;
3812 #endif
3813         if (!amdgpu_device_has_dc_support(adev))
3814                 drm_helper_hpd_irq_event(dev);
3815         else
3816                 drm_kms_helper_hotplug_event(dev);
3817 #ifdef CONFIG_PM
3818         dev->dev->power.disable_depth--;
3819 #endif
3820         adev->in_suspend = false;
3821
3822         return 0;
3823 }
3824
3825 /**
3826  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3827  *
3828  * @adev: amdgpu_device pointer
3829  *
3830  * The list of all the hardware IPs that make up the asic is walked and
3831  * the check_soft_reset callbacks are run.  check_soft_reset determines
3832  * if the asic is still hung or not.
3833  * Returns true if any of the IPs are still in a hung state, false if not.
3834  */
3835 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3836 {
3837         int i;
3838         bool asic_hang = false;
3839
3840         if (amdgpu_sriov_vf(adev))
3841                 return true;
3842
3843         if (amdgpu_asic_need_full_reset(adev))
3844                 return true;
3845
3846         for (i = 0; i < adev->num_ip_blocks; i++) {
3847                 if (!adev->ip_blocks[i].status.valid)
3848                         continue;
3849                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3850                         adev->ip_blocks[i].status.hang =
3851                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3852                 if (adev->ip_blocks[i].status.hang) {
3853                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3854                         asic_hang = true;
3855                 }
3856         }
3857         return asic_hang;
3858 }
3859
3860 /**
3861  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3862  *
3863  * @adev: amdgpu_device pointer
3864  *
3865  * The list of all the hardware IPs that make up the asic is walked and the
3866  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3867  * handles any IP specific hardware or software state changes that are
3868  * necessary for a soft reset to succeed.
3869  * Returns 0 on success, negative error code on failure.
3870  */
3871 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3872 {
3873         int i, r = 0;
3874
3875         for (i = 0; i < adev->num_ip_blocks; i++) {
3876                 if (!adev->ip_blocks[i].status.valid)
3877                         continue;
3878                 if (adev->ip_blocks[i].status.hang &&
3879                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3880                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3881                         if (r)
3882                                 return r;
3883                 }
3884         }
3885
3886         return 0;
3887 }
3888
3889 /**
3890  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3891  *
3892  * @adev: amdgpu_device pointer
3893  *
3894  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3895  * reset is necessary to recover.
3896  * Returns true if a full asic reset is required, false if not.
3897  */
3898 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3899 {
3900         int i;
3901
3902         if (amdgpu_asic_need_full_reset(adev))
3903                 return true;
3904
3905         for (i = 0; i < adev->num_ip_blocks; i++) {
3906                 if (!adev->ip_blocks[i].status.valid)
3907                         continue;
3908                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3909                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3910                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3911                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3912                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3913                         if (adev->ip_blocks[i].status.hang) {
3914                                 dev_info(adev->dev, "Some block needs a full reset!\n");
3915                                 return true;
3916                         }
3917                 }
3918         }
3919         return false;
3920 }
3921
3922 /**
3923  * amdgpu_device_ip_soft_reset - do a soft reset
3924  *
3925  * @adev: amdgpu_device pointer
3926  *
3927  * The list of all the hardware IPs that make up the asic is walked and the
3928  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3929  * IP specific hardware or software state changes that are necessary to soft
3930  * reset the IP.
3931  * Returns 0 on success, negative error code on failure.
3932  */
3933 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3934 {
3935         int i, r = 0;
3936
3937         for (i = 0; i < adev->num_ip_blocks; i++) {
3938                 if (!adev->ip_blocks[i].status.valid)
3939                         continue;
3940                 if (adev->ip_blocks[i].status.hang &&
3941                     adev->ip_blocks[i].version->funcs->soft_reset) {
3942                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3943                         if (r)
3944                                 return r;
3945                 }
3946         }
3947
3948         return 0;
3949 }
3950
3951 /**
3952  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3953  *
3954  * @adev: amdgpu_device pointer
3955  *
3956  * The list of all the hardware IPs that make up the asic is walked and the
3957  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3958  * handles any IP specific hardware or software state changes that are
3959  * necessary after the IP has been soft reset.
3960  * Returns 0 on success, negative error code on failure.
3961  */
3962 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3963 {
3964         int i, r = 0;
3965
3966         for (i = 0; i < adev->num_ip_blocks; i++) {
3967                 if (!adev->ip_blocks[i].status.valid)
3968                         continue;
3969                 if (adev->ip_blocks[i].status.hang &&
3970                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3971                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3972                 if (r)
3973                         return r;
3974         }
3975
3976         return 0;
3977 }
3978
3979 /**
3980  * amdgpu_device_recover_vram - Recover some VRAM contents
3981  *
3982  * @adev: amdgpu_device pointer
3983  *
3984  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3985  * restore things like GPUVM page tables after a GPU reset where
3986  * the contents of VRAM might be lost.
3987  *
3988  * Returns:
3989  * 0 on success, negative error code on failure.
3990  */
3991 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3992 {
3993         struct dma_fence *fence = NULL, *next = NULL;
3994         struct amdgpu_bo *shadow;
3995         long r = 1, tmo;
3996
3997         if (amdgpu_sriov_runtime(adev))
3998                 tmo = msecs_to_jiffies(8000);
3999         else
4000                 tmo = msecs_to_jiffies(100);
4001
4002         dev_info(adev->dev, "recover vram bo from shadow start\n");
4003         mutex_lock(&adev->shadow_list_lock);
4004         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4005
4006                 /* No need to recover an evicted BO */
4007                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4008                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4009                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4010                         continue;
4011
4012                 r = amdgpu_bo_restore_shadow(shadow, &next);
4013                 if (r)
4014                         break;
4015
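                /*
                 * Wait on the fence of the previous restore while the copy
                 * just issued is still in flight, so the restores are
                 * pipelined rather than fully serialized.
                 */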
4016                 if (fence) {
4017                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4018                         dma_fence_put(fence);
4019                         fence = next;
4020                         if (tmo == 0) {
4021                                 r = -ETIMEDOUT;
4022                                 break;
4023                         } else if (tmo < 0) {
4024                                 r = tmo;
4025                                 break;
4026                         }
4027                 } else {
4028                         fence = next;
4029                 }
4030         }
4031         mutex_unlock(&adev->shadow_list_lock);
4032
4033         if (fence)
4034                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4035         dma_fence_put(fence);
4036
4037         if (r < 0 || tmo <= 0) {
4038                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4039                 return -EIO;
4040         }
4041
4042         dev_info(adev->dev, "recover vram bo from shadow done\n");
4043         return 0;
4044 }
4045
4046
4047 /**
4048  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4049  *
4050  * @adev: amdgpu_device pointer
4051  * @from_hypervisor: request from hypervisor
4052  *
4053  * Do a VF FLR and reinitialize the ASIC.
4054  * Returns 0 on success, negative error code on failure.
4055  */
4056 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4057                                      bool from_hypervisor)
4058 {
4059         int r;
4060
4061         if (from_hypervisor)
4062                 r = amdgpu_virt_request_full_gpu(adev, true);
4063         else
4064                 r = amdgpu_virt_reset_gpu(adev);
4065         if (r)
4066                 return r;
4067
4068         amdgpu_amdkfd_pre_reset(adev);
4069
4070         /* Resume IP prior to SMC */
4071         r = amdgpu_device_ip_reinit_early_sriov(adev);
4072         if (r)
4073                 goto error;
4074
4075         amdgpu_virt_init_data_exchange(adev);
4076         /* we need to recover the GART prior to resuming SMC/CP/SDMA */
4077         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4078
4079         r = amdgpu_device_fw_loading(adev);
4080         if (r)
4081                 return r;
4082
4083         /* now we are okay to resume SMC/CP/SDMA */
4084         r = amdgpu_device_ip_reinit_late_sriov(adev);
4085         if (r)
4086                 goto error;
4087
4088         amdgpu_irq_gpu_reset_resume_helper(adev);
4089         r = amdgpu_ib_ring_tests(adev);
4090         amdgpu_amdkfd_post_reset(adev);
4091
4092 error:
4093         amdgpu_virt_release_full_gpu(adev, true);
4094         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4095                 amdgpu_inc_vram_lost(adev);
4096                 r = amdgpu_device_recover_vram(adev);
4097         }
4098
4099         return r;
4100 }
4101
4102 /**
4103  * amdgpu_device_has_job_running - check if there is any job in mirror list
4104  *
4105  * @adev: amdgpu_device pointer
4106  *
4107  * check if there is any job in mirror list
4108  */
4109 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4110 {
4111         int i;
4112         struct drm_sched_job *job;
4113
4114         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4115                 struct amdgpu_ring *ring = adev->rings[i];
4116
4117                 if (!ring || !ring->sched.thread)
4118                         continue;
4119
4120                 spin_lock(&ring->sched.job_list_lock);
4121                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4122                                 struct drm_sched_job, node);
4123                 spin_unlock(&ring->sched.job_list_lock);
4124                 if (job)
4125                         return true;
4126         }
4127         return false;
4128 }
4129
4130 /**
4131  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4132  *
4133  * @adev: amdgpu_device pointer
4134  *
4135  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4136  * a hung GPU.
4137  */
4138 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4139 {
4140         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4141                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4142                 return false;
4143         }
4144
4145         if (amdgpu_gpu_recovery == 0)
4146                 goto disabled;
4147
4148         if (amdgpu_sriov_vf(adev))
4149                 return true;
4150
4151         if (amdgpu_gpu_recovery == -1) {
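                /*
                 * In the default auto mode (-1) only allow recovery on
                 * ASICs where it is known to work.
                 */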
4152                 switch (adev->asic_type) {
4153                 case CHIP_BONAIRE:
4154                 case CHIP_HAWAII:
4155                 case CHIP_TOPAZ:
4156                 case CHIP_TONGA:
4157                 case CHIP_FIJI:
4158                 case CHIP_POLARIS10:
4159                 case CHIP_POLARIS11:
4160                 case CHIP_POLARIS12:
4161                 case CHIP_VEGAM:
4162                 case CHIP_VEGA20:
4163                 case CHIP_VEGA10:
4164                 case CHIP_VEGA12:
4165                 case CHIP_RAVEN:
4166                 case CHIP_ARCTURUS:
4167                 case CHIP_RENOIR:
4168                 case CHIP_NAVI10:
4169                 case CHIP_NAVI14:
4170                 case CHIP_NAVI12:
4171                 case CHIP_SIENNA_CICHLID:
4172                         break;
4173                 default:
4174                         goto disabled;
4175                 }
4176         }
4177
4178         return true;
4179
4180 disabled:
4181                 dev_info(adev->dev, "GPU recovery disabled.\n");
4182                 return false;
4183 }
4184
4185
4186 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4187                                         struct amdgpu_job *job,
4188                                         bool *need_full_reset_arg)
4189 {
4190         int i, r = 0;
4191         bool need_full_reset  = *need_full_reset_arg;
4192
4193         amdgpu_debugfs_wait_dump(adev);
4194
4195         if (amdgpu_sriov_vf(adev)) {
4196                 /* stop the data exchange thread */
4197                 amdgpu_virt_fini_data_exchange(adev);
4198         }
4199
4200         /* block all schedulers and reset given job's ring */
4201         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4202                 struct amdgpu_ring *ring = adev->rings[i];
4203
4204                 if (!ring || !ring->sched.thread)
4205                         continue;
4206
4207                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4208                 amdgpu_fence_driver_force_completion(ring);
4209         }
4210
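        /* Bump the offending job's karma so its context can be marked guilty. */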
4211         if (job)
4212                 drm_sched_increase_karma(&job->base);
4213
4214         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4215         if (!amdgpu_sriov_vf(adev)) {
4216
4217                 if (!need_full_reset)
4218                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4219
4220                 if (!need_full_reset) {
4221                         amdgpu_device_ip_pre_soft_reset(adev);
4222                         r = amdgpu_device_ip_soft_reset(adev);
4223                         amdgpu_device_ip_post_soft_reset(adev);
4224                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4225                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4226                                 need_full_reset = true;
4227                         }
4228                 }
4229
4230                 if (need_full_reset)
4231                         r = amdgpu_device_ip_suspend(adev);
4232
4233                 *need_full_reset_arg = need_full_reset;
4234         }
4235
4236         return r;
4237 }
4238
4239 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4240                                struct list_head *device_list_handle,
4241                                bool *need_full_reset_arg,
4242                                bool skip_hw_reset)
4243 {
4244         struct amdgpu_device *tmp_adev = NULL;
4245         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4246         int r = 0;
4247
4248         /*
4249          * ASIC reset has to be done on all XGMI hive nodes ASAP
4250          * to allow proper links negotiation in FW (within 1 sec)
4251          */
4252         if (!skip_hw_reset && need_full_reset) {
4253                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4254                         /* For XGMI run all resets in parallel to speed up the process */
4255                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
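                                /*
                                 * queue_work() returns false only when the
                                 * reset work is already pending, hence -EALREADY.
                                 */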
4256                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4257                                         r = -EALREADY;
4258                         } else
4259                                 r = amdgpu_asic_reset(tmp_adev);
4260
4261                         if (r) {
4262                                 dev_err(tmp_adev->dev, "ASIC reset failed with error %d for drm dev %s",
4263                                          r, adev_to_drm(tmp_adev)->unique);
4264                                 break;
4265                         }
4266                 }
4267
4268                 /* For XGMI wait for all resets to complete before proceed */
4269                 if (!r) {
4270                         list_for_each_entry(tmp_adev, device_list_handle,
4271                                             gmc.xgmi.head) {
4272                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4273                                         flush_work(&tmp_adev->xgmi_reset_work);
4274                                         r = tmp_adev->asic_reset_res;
4275                                         if (r)
4276                                                 break;
4277                                 }
4278                         }
4279                 }
4280         }
4281
4282         if (!r && amdgpu_ras_intr_triggered()) {
4283                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4284                         if (tmp_adev->mmhub.funcs &&
4285                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4286                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4287                 }
4288
4289                 amdgpu_ras_intr_cleared();
4290         }
4291
4292         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4293                 if (need_full_reset) {
4294                         /* post card */
4295                         if (amdgpu_device_asic_init(tmp_adev))
4296                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4297
4298                         if (!r) {
4299                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4300                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4301                                 if (r)
4302                                         goto out;
4303
4304                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4305                                 if (vram_lost) {
4306                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4307                                         amdgpu_inc_vram_lost(tmp_adev);
4308                                 }
4309
4310                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4311                                 if (r)
4312                                         goto out;
4313
4314                                 r = amdgpu_device_fw_loading(tmp_adev);
4315                                 if (r)
4316                                         return r;
4317
4318                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4319                                 if (r)
4320                                         goto out;
4321
4322                                 if (vram_lost)
4323                                         amdgpu_device_fill_reset_magic(tmp_adev);
4324
4325                                 /*
4326                                   * Add this ASIC back as tracked since the reset
4327                                   * has already completed successfully.
4328                                  */
4329                                 amdgpu_register_gpu_instance(tmp_adev);
4330
4331                                 r = amdgpu_device_ip_late_init(tmp_adev);
4332                                 if (r)
4333                                         goto out;
4334
4335                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4336
4337                                 /*
4338                                  * The GPU enters a bad state once the number of
4339                                  * faulty pages reported by ECC reaches the
4340                                  * threshold, and RAS recovery is scheduled next.
4341                                  * Check here and break out of recovery if the bad
4342                                  * page threshold has indeed been exceeded, and
4343                                  * remind the user to either retire this GPU or
4344                                  * set a larger bad_page_threshold the next time
4345                                  * the driver is probed.
4346                                  */
4347                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4348                                         /* must succeed. */
4349                                         amdgpu_ras_resume(tmp_adev);
4350                                 } else {
4351                                         r = -EINVAL;
4352                                         goto out;
4353                                 }
4354
4355                                 /* Update PSP FW topology after reset */
4356                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4357                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4358                         }
4359                 }
4360
4361 out:
4362                 if (!r) {
4363                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4364                         r = amdgpu_ib_ring_tests(tmp_adev);
4365                         if (r) {
4366                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4367                                 r = amdgpu_device_ip_suspend(tmp_adev);
4368                                 need_full_reset = true;
4369                                 r = -EAGAIN;
4370                                 goto end;
4371                         }
4372                 }
4373
4374                 if (!r)
4375                         r = amdgpu_device_recover_vram(tmp_adev);
4376                 else
4377                         tmp_adev->asic_reset_res = r;
4378         }
4379
4380 end:
4381         *need_full_reset_arg = need_full_reset;
4382         return r;
4383 }
4384
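     /**
      * amdgpu_device_lock_adev - lock a device for GPU reset
      *
      * @adev: amdgpu_device pointer
      * @hive: XGMI hive the device belongs to, or NULL
      *
      * Atomically marks the device as being in GPU reset, takes the reset
      * semaphore (nested against the hive lock when the device is part of a
      * hive), bumps the reset counter and sets the MP1 state expected by the
      * chosen reset method. Returns false if a reset is already in progress
      * on this device.
      */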
4385 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4386                                 struct amdgpu_hive_info *hive)
4387 {
4388         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4389                 return false;
4390
4391         if (hive) {
4392                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4393         } else {
4394                 down_write(&adev->reset_sem);
4395         }
4396
4397         atomic_inc(&adev->gpu_reset_counter);
4398         switch (amdgpu_asic_reset_method(adev)) {
4399         case AMD_RESET_METHOD_MODE1:
4400                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4401                 break;
4402         case AMD_RESET_METHOD_MODE2:
4403                 adev->mp1_state = PP_MP1_STATE_RESET;
4404                 break;
4405         default:
4406                 adev->mp1_state = PP_MP1_STATE_NONE;
4407                 break;
4408         }
4409
4410         return true;
4411 }
4412
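     /**
      * amdgpu_device_unlock_adev - release the GPU reset lock of a device
      *
      * @adev: amdgpu_device pointer
      *
      * Flushes pending VF error records, restores the MP1 state, clears the
      * in_gpu_reset flag and releases the reset semaphore taken by
      * amdgpu_device_lock_adev().
      */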
4413 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4414 {
4415         amdgpu_vf_error_trans_all(adev);
4416         adev->mp1_state = PP_MP1_STATE_NONE;
4417         atomic_set(&adev->in_gpu_reset, 0);
4418         up_write(&adev->reset_sem);
4419 }
4420
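     /**
      * amdgpu_device_resume_display_audio - resume the GPU's display audio function
      *
      * @adev: amdgpu_device pointer
      *
      * Re-enables runtime PM on the audio function sharing the GPU's PCI bus
      * (devfn 1) and resumes it, undoing amdgpu_device_suspend_display_audio().
      */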
4421 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4422 {
4423         struct pci_dev *p = NULL;
4424
4425         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4426                         adev->pdev->bus->number, 1);
4427         if (p) {
4428                 pm_runtime_enable(&(p->dev));
4429                 pm_runtime_resume(&(p->dev));
4430         }
4431 }
4432
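     /**
      * amdgpu_device_suspend_display_audio - suspend the GPU's display audio function
      *
      * @adev: amdgpu_device pointer
      *
      * Puts the audio function sharing the GPU's PCI bus (devfn 1) into runtime
      * suspend before a BACO or mode1 reset, waiting up to the audio
      * controller's autosuspend delay (or 4 seconds) for it to go idle, then
      * disables runtime PM so it stays suspended across the reset.
      * Returns 0 on success or a negative error code if the reset method does
      * not need this, the audio device is not found, or it does not suspend
      * in time.
      */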
4433 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4434 {
4435         enum amd_reset_method reset_method;
4436         struct pci_dev *p = NULL;
4437         u64 expires;
4438
4439         /*
4440          * For now, only BACO and mode1 reset are confirmed
4441          * to suffer the audio issue without being properly suspended.
4442          */
4443         reset_method = amdgpu_asic_reset_method(adev);
4444         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4445              (reset_method != AMD_RESET_METHOD_MODE1))
4446                 return -EINVAL;
4447
4448         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4449                         adev->pdev->bus->number, 1);
4450         if (!p)
4451                 return -ENODEV;
4452
4453         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4454         if (!expires)
4455                 /*
4456                  * If we cannot get the audio device autosuspend delay,
4457                  * a fixed 4s interval will be used. Since 3s is the audio
4458                  * controller's default autosuspend delay setting, the 4s
4459                  * used here is guaranteed to cover it.
4460                  */
4461                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4462
4463         while (!pm_runtime_status_suspended(&(p->dev))) {
4464                 if (!pm_runtime_suspend(&(p->dev)))
4465                         break;
4466
4467                 if (expires < ktime_get_mono_fast_ns()) {
4468                         dev_warn(adev->dev, "failed to suspend display audio\n");
4469                         /* TODO: abort the succeeding gpu reset? */
4470                         return -ETIMEDOUT;
4471                 }
4472         }
4473
4474         pm_runtime_disable(&(p->dev));
4475
4476         return 0;
4477 }
4478
4479 /**
4480  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4481  *
4482  * @adev: amdgpu_device pointer
4483  * @job: which job triggered the hang
4484  *
4485  * Attempt to reset the GPU if it has hung (all ASICs).
4486  * Attempt to do a soft reset or full reset and reinitialize the ASIC.
4487  * Returns 0 for success or an error on failure.
4488  */
4489
4490 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4491                               struct amdgpu_job *job)
4492 {
4493         struct list_head device_list, *device_list_handle = NULL;
4494         bool need_full_reset = false;
4495         bool job_signaled = false;
4496         struct amdgpu_hive_info *hive = NULL;
4497         struct amdgpu_device *tmp_adev = NULL;
4498         int i, r = 0;
4499         bool need_emergency_restart = false;
4500         bool audio_suspended = false;
4501
4502         /*
4503          * Special case: RAS triggered and full reset isn't supported
4504          */
4505         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4506
4507         /*
4508          * Flush RAM to disk so that after reboot
4509          * the user can read log and see why the system rebooted.
4510          */
4511         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4512                 DRM_WARN("Emergency reboot.");
4513
4514                 ksys_sync_helper();
4515                 emergency_restart();
4516         }
4517
4518         dev_info(adev->dev, "GPU %s begin!\n",
4519                 need_emergency_restart ? "jobs stop":"reset");
4520
4521         /*
4522          * Here we trylock to avoid a chain of resets executing while this
4523          * timeout handler is running, triggered either by jobs on different
4524          * adevs in an XGMI hive or by jobs on different schedulers for the
4525          * same device. We always reset all schedulers for a device and all
4526          * devices in an XGMI hive, so that should take care of them too.
4527          */
4528         hive = amdgpu_get_xgmi_hive(adev);
4529         if (hive) {
4530                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4531                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4532                                 job ? job->base.id : -1, hive->hive_id);
4533                         amdgpu_put_xgmi_hive(hive);
4534                         return 0;
4535                 }
4536                 mutex_lock(&hive->hive_lock);
4537         }
4538
4539         /*
4540          * Build list of devices to reset.
4541          * In case we are in XGMI hive mode, resort the device list
4542          * to put adev in the 1st position.
4543          */
4544         INIT_LIST_HEAD(&device_list);
4545         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4546                 if (!hive)
4547                         return -ENODEV;
4548                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4549                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4550                 device_list_handle = &hive->device_list;
4551         } else {
4552                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4553                 device_list_handle = &device_list;
4554         }
4555
4556         /* block all schedulers and reset given job's ring */
4557         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4558                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4559                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4560                                   job ? job->base.id : -1);
4561                         r = 0;
4562                         goto skip_recovery;
4563                 }
4564
4565                 /*
4566                  * Try to put the audio codec into a suspended state
4567                  * before the gpu reset is started.
4568                  *
4569                  * The power domain of the graphics device is shared
4570                  * with the AZ (audio) power domain. Without this, we
4571                  * may change the audio hardware behind the audio
4572                  * driver's back, which will trigger audio codec
4573                  * errors.
4574                  */
4575                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4576                         audio_suspended = true;
4577
4578                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4579
4580                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4581
4582                 if (!amdgpu_sriov_vf(tmp_adev))
4583                         amdgpu_amdkfd_pre_reset(tmp_adev);
4584
4585                 /*
4586                  * Mark the ASICs to be reset as untracked first,
4587                  * and add them back after the reset completes.
4588                  */
4589                 amdgpu_unregister_gpu_instance(tmp_adev);
4590
4591                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4592
4593                 /* disable ras on ALL IPs */
4594                 if (!need_emergency_restart &&
4595                       amdgpu_device_ip_need_full_reset(tmp_adev))
4596                         amdgpu_ras_suspend(tmp_adev);
4597
4598                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4599                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4600
4601                         if (!ring || !ring->sched.thread)
4602                                 continue;
4603
4604                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4605
4606                         if (need_emergency_restart)
4607                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4608                 }
4609         }
4610
4611         if (need_emergency_restart)
4612                 goto skip_sched_resume;
4613
4614         /*
4615          * Must check guilty signal here since after this point all old
4616          * HW fences are force signaled.
4617          *
4618          * job->base holds a reference to parent fence
4619          */
4620         if (job && job->base.s_fence->parent &&
4621             dma_fence_is_signaled(job->base.s_fence->parent)) {
4622                 job_signaled = true;
4623                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4624                 goto skip_hw_reset;
4625         }
4626
4627 retry:  /* Pre-ASIC reset for the rest of the adevs in the XGMI hive. */
4628         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4629                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4630                                                  (tmp_adev == adev) ? job : NULL,
4631                                                  &need_full_reset);
4632                 /* TODO: should we stop? */
4633                 if (r) {
4634                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4635                                   r, adev_to_drm(tmp_adev)->unique);
4636                         tmp_adev->asic_reset_res = r;
4637                 }
4638         }
4639
4640         /* Actual ASIC resets if needed. */
4641         /* TODO Implement XGMI hive reset logic for SRIOV */
4642         if (amdgpu_sriov_vf(adev)) {
4643                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4644                 if (r)
4645                         adev->asic_reset_res = r;
4646         } else {
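                     /*
                      * amdgpu_do_asic_reset() returns -EAGAIN when the
                      * post-reset IB ring test fails; need_full_reset has
                      * then been set, so retry the whole sequence.
                      */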
4647                 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4648                 if (r && r == -EAGAIN)
4649                         goto retry;
4650         }
4651
4652 skip_hw_reset:
4653
4654         /* Post ASIC reset for all devs. */
4655         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4656
4657                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4658                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4659
4660                         if (!ring || !ring->sched.thread)
4661                                 continue;
4662
4663                         /* No point in resubmitting jobs if we didn't HW reset */
4664                         if (!tmp_adev->asic_reset_res && !job_signaled)
4665                                 drm_sched_resubmit_jobs(&ring->sched);
4666
4667                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4668                 }
4669
4670                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4671                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4672                 }
4673
4674                 tmp_adev->asic_reset_res = 0;
4675
4676                 if (r) {
4677                         /* bad news, how to tell it to userspace ? */
4678                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4679                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4680                 } else {
4681                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4682                 }
4683         }
4684
4685 skip_sched_resume:
4686         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4687                 /* unlock kfd: SRIOV would do it separately */
4688                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4689                         amdgpu_amdkfd_post_reset(tmp_adev);
4690                 if (audio_suspended)
4691                         amdgpu_device_resume_display_audio(tmp_adev);
4692                 amdgpu_device_unlock_adev(tmp_adev);
4693         }
4694
4695 skip_recovery:
4696         if (hive) {
4697                 atomic_set(&hive->in_reset, 0);
4698                 mutex_unlock(&hive->hive_lock);
4699                 amdgpu_put_xgmi_hive(hive);
4700         }
4701
4702         if (r)
4703                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4704         return r;
4705 }
4706
4707 /**
4708  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4709  *
4710  * @adev: amdgpu_device pointer
4711  *
4712  * Fetches and stores in the driver the PCIE capabilities (gen speed
4713  * and lanes) of the slot the device is in. Handles APUs and
4714  * virtualized environments where PCIE config space may not be available.
4715  */
4716 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4717 {
4718         struct pci_dev *pdev;
4719         enum pci_bus_speed speed_cap, platform_speed_cap;
4720         enum pcie_link_width platform_link_width;
4721
4722         if (amdgpu_pcie_gen_cap)
4723                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4724
4725         if (amdgpu_pcie_lane_cap)
4726                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4727
4728         /* covers APUs as well */
4729         if (pci_is_root_bus(adev->pdev->bus)) {
4730                 if (adev->pm.pcie_gen_mask == 0)
4731                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4732                 if (adev->pm.pcie_mlw_mask == 0)
4733                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4734                 return;
4735         }
4736
4737         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4738                 return;
4739
4740         pcie_bandwidth_available(adev->pdev, NULL,
4741                                  &platform_speed_cap, &platform_link_width);
4742
4743         if (adev->pm.pcie_gen_mask == 0) {
4744                 /* asic caps */
4745                 pdev = adev->pdev;
4746                 speed_cap = pcie_get_speed_cap(pdev);
4747                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4748                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4749                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4750                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4751                 } else {
4752                         if (speed_cap == PCIE_SPEED_16_0GT)
4753                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4754                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4755                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4756                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4757                         else if (speed_cap == PCIE_SPEED_8_0GT)
4758                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4759                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4760                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4761                         else if (speed_cap == PCIE_SPEED_5_0GT)
4762                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4763                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4764                         else
4765                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4766                 }
4767                 /* platform caps */
4768                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4769                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4770                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4771                 } else {
4772                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4773                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4774                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4775                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4776                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4777                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4778                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4779                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4780                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4781                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4782                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4783                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4784                         else
4785                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4786
4787                 }
4788         }
4789         if (adev->pm.pcie_mlw_mask == 0) {
4790                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4791                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4792                 } else {
4793                         switch (platform_link_width) {
4794                         case PCIE_LNK_X32:
4795                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4796                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4797                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4798                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4799                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4800                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4801                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4802                                 break;
4803                         case PCIE_LNK_X16:
4804                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4805                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4806                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4807                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4808                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4809                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4810                                 break;
4811                         case PCIE_LNK_X12:
4812                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4813                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4814                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4815                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4816                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4817                                 break;
4818                         case PCIE_LNK_X8:
4819                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4820                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4821                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4822                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4823                                 break;
4824                         case PCIE_LNK_X4:
4825                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4826                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4827                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4828                                 break;
4829                         case PCIE_LNK_X2:
4830                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4831                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4832                                 break;
4833                         case PCIE_LNK_X1:
4834                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4835                                 break;
4836                         default:
4837                                 break;
4838                         }
4839                 }
4840         }
4841 }
4842
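     /**
      * amdgpu_device_baco_enter - enter the BACO runtime power state
      *
      * @dev: drm_device pointer
      *
      * Disables the doorbell interrupt when RAS is supported and asks the dpm
      * layer to put the ASIC into BACO. Returns -ENOTSUPP if the device does
      * not support BACO.
      */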
4843 int amdgpu_device_baco_enter(struct drm_device *dev)
4844 {
4845         struct amdgpu_device *adev = drm_to_adev(dev);
4846         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4847
4848         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4849                 return -ENOTSUPP;
4850
4851         if (ras && ras->supported)
4852                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4853
4854         return amdgpu_dpm_baco_enter(adev);
4855 }
4856
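     /**
      * amdgpu_device_baco_exit - leave the BACO runtime power state
      *
      * @dev: drm_device pointer
      *
      * Asks the dpm layer to bring the ASIC out of BACO and re-enables the
      * doorbell interrupt when RAS is supported. Returns -ENOTSUPP if the
      * device does not support BACO, or the error from amdgpu_dpm_baco_exit().
      */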
4857 int amdgpu_device_baco_exit(struct drm_device *dev)
4858 {
4859         struct amdgpu_device *adev = drm_to_adev(dev);
4860         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4861         int ret = 0;
4862
4863         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4864                 return -ENOTSUPP;
4865
4866         ret = amdgpu_dpm_baco_exit(adev);
4867         if (ret)
4868                 return ret;
4869
4870         if (ras && ras->supported)
4871                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4872
4873         return 0;
4874 }
4875
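     /**
      * amdgpu_cancel_all_tdr - cancel pending scheduler timeout work
      *
      * @adev: amdgpu_device pointer
      *
      * Cancels and waits for the timeout (TDR) work of every active ring's
      * scheduler, so that no timeout handler races with the caller.
      */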
4876 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4877 {
4878         int i;
4879
4880         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4881                 struct amdgpu_ring *ring = adev->rings[i];
4882
4883                 if (!ring || !ring->sched.thread)
4884                         continue;
4885
4886                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4887         }
4888 }
4889
4890 /**
4891  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4892  * @pdev: PCI device struct
4893  * @state: PCI channel state
4894  *
4895  * Description: Called when a PCI error is detected.
4896  *
4897  * Return: PCI_ERS_RESULT_NEED_RESET, PCI_ERS_RESULT_CAN_RECOVER or PCI_ERS_RESULT_DISCONNECT.
4898  */
4899 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4900 {
4901         struct drm_device *dev = pci_get_drvdata(pdev);
4902         struct amdgpu_device *adev = drm_to_adev(dev);
4903         int i;
4904
4905         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4906
4907         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4908                 DRM_WARN("No support for XGMI hive yet...");
4909                 return PCI_ERS_RESULT_DISCONNECT;
4910         }
4911
4912         switch (state) {
4913         case pci_channel_io_normal:
4914                 return PCI_ERS_RESULT_CAN_RECOVER;
4915         /* Fatal error, prepare for slot reset */
4916         case pci_channel_io_frozen:
4917                 /*
4918                  * Cancel and wait for all TDRs in progress if we fail to
4919                  * set adev->in_gpu_reset in amdgpu_device_lock_adev.
4920                  *
4921                  * Locking adev->reset_sem will prevent any external access
4922                  * to the GPU during PCI error recovery.
4923                  */
4924                 while (!amdgpu_device_lock_adev(adev, NULL))
4925                         amdgpu_cancel_all_tdr(adev);
4926
4927                 /*
4928                  * Block any work scheduling as we do for regular GPU reset
4929                  * for the duration of the recovery
4930                  */
4931                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4932                         struct amdgpu_ring *ring = adev->rings[i];
4933
4934                         if (!ring || !ring->sched.thread)
4935                                 continue;
4936
4937                         drm_sched_stop(&ring->sched, NULL);
4938                 }
4939                 return PCI_ERS_RESULT_NEED_RESET;
4940         case pci_channel_io_perm_failure:
4941                 /* Permanent error, prepare for device removal */
4942                 return PCI_ERS_RESULT_DISCONNECT;
4943         }
4944
4945         return PCI_ERS_RESULT_NEED_RESET;
4946 }
4947
4948 /**
4949  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4950  * @pdev: pointer to PCI device
4951  */
4952 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4953 {
4954
4955         DRM_INFO("PCI error: mmio enabled callback!!\n");
4956
4957         /* TODO - dump whatever for debugging purposes */
4958
4959         /* This is called only if amdgpu_pci_error_detected returns
4960          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4961          * works, no need to reset slot.
4962          */
4963
4964         return PCI_ERS_RESULT_RECOVERED;
4965 }
4966
4967 /**
4968  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4969  * @pdev: PCI device struct
4970  *
4971  * Description: This routine is called by the pci error recovery
4972  * code after the PCI slot has been reset, just before we
4973  * should resume normal operations.
4974  */
4975 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4976 {
4977         struct drm_device *dev = pci_get_drvdata(pdev);
4978         struct amdgpu_device *adev = drm_to_adev(dev);
4979         int r, i;
4980         bool need_full_reset = true;
4981         u32 memsize;
4982         struct list_head device_list;
4983
4984         DRM_INFO("PCI error: slot reset callback!!\n");
4985
4986         INIT_LIST_HEAD(&device_list);
4987         list_add_tail(&adev->gmc.xgmi.head, &device_list);
4988
4989         /* wait for asic to come out of reset */
4990         msleep(500);
4991
4992         /* Restore PCI config space */
4993         amdgpu_device_load_pci_state(pdev);
4994
4995         /* confirm ASIC came out of reset */
4996         for (i = 0; i < adev->usec_timeout; i++) {
4997                 memsize = amdgpu_asic_get_config_memsize(adev);
4998
4999                 if (memsize != 0xffffffff)
5000                         break;
5001                 udelay(1);
5002         }
5003         if (memsize == 0xffffffff) {
5004                 r = -ETIME;
5005                 goto out;
5006         }
5007
5008         adev->in_pci_err_recovery = true;
5009         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5010         adev->in_pci_err_recovery = false;
5011         if (r)
5012                 goto out;
5013
5014         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5015
5016 out:
5017         if (!r) {
5018                 if (amdgpu_device_cache_pci_state(adev->pdev))
5019                         pci_restore_state(adev->pdev);
5020
5021                 DRM_INFO("PCIe error recovery succeeded\n");
5022         } else {
5023                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5024                 amdgpu_device_unlock_adev(adev);
5025         }
5026
5027         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5028 }
5029
5030 /**
5031  * amdgpu_pci_resume() - resume normal ops after PCI reset
5032  * @pdev: pointer to PCI device
5033  *
5034  * Called when the error recovery driver tells us that it is
5035  * OK to resume normal operation. Restart the schedulers that
5036  * were stopped during error handling.
5037  */
5038 void amdgpu_pci_resume(struct pci_dev *pdev)
5039 {
5040         struct drm_device *dev = pci_get_drvdata(pdev);
5041         struct amdgpu_device *adev = drm_to_adev(dev);
5042         int i;
5043
5044
5045         DRM_INFO("PCI error: resume callback!!\n");
5046
5047         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5048                 struct amdgpu_ring *ring = adev->rings[i];
5049
5050                 if (!ring || !ring->sched.thread)
5051                         continue;
5052
5053
5054                 drm_sched_resubmit_jobs(&ring->sched);
5055                 drm_sched_start(&ring->sched, true);
5056         }
5057
5058         amdgpu_device_unlock_adev(adev);
5059 }
5060
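     /**
      * amdgpu_device_cache_pci_state - save the PCI config space of the device
      *
      * @pdev: PCI device struct
      *
      * Saves the current PCI configuration space and keeps a copy in
      * adev->pci_state so it can be restored after a reset or PCI error.
      * Returns true on success, false otherwise.
      */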
5061 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5062 {
5063         struct drm_device *dev = pci_get_drvdata(pdev);
5064         struct amdgpu_device *adev = drm_to_adev(dev);
5065         int r;
5066
5067         r = pci_save_state(pdev);
5068         if (!r) {
5069                 kfree(adev->pci_state);
5070
5071                 adev->pci_state = pci_store_saved_state(pdev);
5072
5073                 if (!adev->pci_state) {
5074                         DRM_ERROR("Failed to store PCI saved state");
5075                         return false;
5076                 }
5077         } else {
5078                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5079                 return false;
5080         }
5081
5082         return true;
5083 }
5084
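     /**
      * amdgpu_device_load_pci_state - restore the cached PCI config space
      *
      * @pdev: PCI device struct
      *
      * Loads the PCI configuration space previously cached by
      * amdgpu_device_cache_pci_state() and writes it back to the device.
      * Returns true on success, false if nothing was cached or the load failed.
      */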
5085 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5086 {
5087         struct drm_device *dev = pci_get_drvdata(pdev);
5088         struct amdgpu_device *adev = drm_to_adev(dev);
5089         int r;
5090
5091         if (!adev->pci_state)
5092                 return false;
5093
5094         r = pci_load_saved_state(pdev, adev->pci_state);
5095
5096         if (!r) {
5097                 pci_restore_state(pdev);
5098         } else {
5099                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5100                 return false;
5101         }
5102
5103         return true;
5104 }
5105
5106