1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83
84 #define AMDGPU_RESUME_MS                2000
85
86 const char *amdgpu_asic_name[] = {
87         "TAHITI",
88         "PITCAIRN",
89         "VERDE",
90         "OLAND",
91         "HAINAN",
92         "BONAIRE",
93         "KAVERI",
94         "KABINI",
95         "HAWAII",
96         "MULLINS",
97         "TOPAZ",
98         "TONGA",
99         "FIJI",
100         "CARRIZO",
101         "STONEY",
102         "POLARIS10",
103         "POLARIS11",
104         "POLARIS12",
105         "VEGAM",
106         "VEGA10",
107         "VEGA12",
108         "VEGA20",
109         "RAVEN",
110         "ARCTURUS",
111         "RENOIR",
112         "NAVI10",
113         "NAVI14",
114         "NAVI12",
115         "SIENNA_CICHLID",
116         "NAVY_FLOUNDER",
117         "LAST",
118 };
119
120 /**
121  * DOC: pcie_replay_count
122  *
123  * The amdgpu driver provides a sysfs API for reporting the total number
124  * of PCIe replays (NAKs).
125  * The file pcie_replay_count is used for this and returns the total
126  * number of replays as a sum of the NAKs generated and NAKs received.
127  */
128
129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130                 struct device_attribute *attr, char *buf)
131 {
132         struct drm_device *ddev = dev_get_drvdata(dev);
133         struct amdgpu_device *adev = drm_to_adev(ddev);
134         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135
136         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140                 amdgpu_device_get_pcie_replay_count, NULL);
141
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143
144 /**
145  * DOC: product_name
146  *
147  * The amdgpu driver provides a sysfs API for reporting the product name
148  * for the device.
149  * The file product_name is used for this and returns the product name
150  * as returned from the FRU.
151  * NOTE: This is only available for certain server cards
152  */
153
154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155                 struct device_attribute *attr, char *buf)
156 {
157         struct drm_device *ddev = dev_get_drvdata(dev);
158         struct amdgpu_device *adev = drm_to_adev(ddev);
159
160         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162
163 static DEVICE_ATTR(product_name, S_IRUGO,
164                 amdgpu_device_get_product_name, NULL);
165
166 /**
167  * DOC: product_number
168  *
169  * The amdgpu driver provides a sysfs API for reporting the part number
170  * for the device.
171  * The file product_number is used for this and returns the part number
172  * as returned from the FRU.
173  * NOTE: This is only available for certain server cards
174  */
175
176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177                 struct device_attribute *attr, char *buf)
178 {
179         struct drm_device *ddev = dev_get_drvdata(dev);
180         struct amdgpu_device *adev = drm_to_adev(ddev);
181
182         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184
185 static DEVICE_ATTR(product_number, S_IRUGO,
186                 amdgpu_device_get_product_number, NULL);
187
188 /**
189  * DOC: serial_number
190  *
191  * The amdgpu driver provides a sysfs API for reporting the serial number
192  * for the device.
193  * The file serial_number is used for this and returns the serial number
194  * as returned from the FRU.
195  * NOTE: This is only available for certain server cards
196  */
197
198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199                 struct device_attribute *attr, char *buf)
200 {
201         struct drm_device *ddev = dev_get_drvdata(dev);
202         struct amdgpu_device *adev = drm_to_adev(ddev);
203
204         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208                 amdgpu_device_get_serial_number, NULL);
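
/*
 * Example (illustrative only): these attributes are read from userspace via
 * sysfs. The path below is an assumption and depends on how the card is
 * enumerated, e.g. /sys/class/drm/card0/device/.
 *
 *   char buf[64];
 *   FILE *f = fopen("/sys/class/drm/card0/device/serial_number", "r");
 *
 *   if (f && fgets(buf, sizeof(buf), f))
 *           printf("serial: %s", buf);
 *   if (f)
 *           fclose(f);
 *
 * product_name, product_number and pcie_replay_count are read the same way.
 */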
209
210 /**
211  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212  *
213  * @dev: drm_device pointer
214  *
215  * Returns true if the device is a dGPU with HG/PX power control,
216  * otherwise returns false.
217  */
218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220         struct amdgpu_device *adev = drm_to_adev(dev);
221
222         if (adev->flags & AMD_IS_PX)
223                 return true;
224         return false;
225 }
226
227 /**
228  * amdgpu_device_supports_baco - Does the device support BACO
229  *
230  * @dev: drm_device pointer
231  *
232  * Returns true if the device supports BACO,
233  * otherwise returns false.
234  */
235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237         struct amdgpu_device *adev = drm_to_adev(dev);
238
239         return amdgpu_asic_supports_baco(adev);
240 }
241
242 /**
243  * VRAM access helper functions.
244  *
245  * amdgpu_device_vram_access - read/write a buffer in vram
246  *
247  * @adev: amdgpu_device pointer
248  * @pos: offset of the buffer in vram
249  * @buf: virtual address of the buffer in system memory
250  * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
251  * @write: true - write to vram, otherwise - read from vram
252  */
253 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
254                                uint32_t *buf, size_t size, bool write)
255 {
256         unsigned long flags;
257         uint32_t hi = ~0;
258         uint64_t last;
259
260
261 #ifdef CONFIG_64BIT
262         last = min(pos + size, adev->gmc.visible_vram_size);
263         if (last > pos) {
264                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
265                 size_t count = last - pos;
266
267                 if (write) {
268                         memcpy_toio(addr, buf, count);
269                         mb();
270                         amdgpu_asic_flush_hdp(adev, NULL);
271                 } else {
272                         amdgpu_asic_invalidate_hdp(adev, NULL);
273                         mb();
274                         memcpy_fromio(buf, addr, count);
275                 }
276
277                 if (count == size)
278                         return;
279
280                 pos += count;
281                 buf += count / 4;
282                 size -= count;
283         }
284 #endif
285
286         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
287         for (last = pos + size; pos < last; pos += 4) {
288                 uint32_t tmp = pos >> 31;
289
290                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
291                 if (tmp != hi) {
292                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
293                         hi = tmp;
294                 }
295                 if (write)
296                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
297                 else
298                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
299         }
300         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
301 }
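
/*
 * Usage sketch (illustrative, not taken from the driver): read the first
 * four dwords of VRAM into a local buffer; @size is in bytes and @buf must
 * be large enough to hold it.
 *
 *   uint32_t data[4];
 *
 *   amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 *
 * A write works the same way with the last argument set to true.
 */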
302
303 /*
304  * register access helper functions.
305  */
306 /**
307  * amdgpu_device_rreg - read a memory mapped IO or indirect register
308  *
309  * @adev: amdgpu_device pointer
310  * @reg: dword aligned register offset
311  * @acc_flags: access flags which require special behavior
312  *
313  * Returns the 32 bit value from the offset specified.
314  */
315 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
316                             uint32_t reg, uint32_t acc_flags)
317 {
318         uint32_t ret;
319
320         if (adev->in_pci_err_recovery)
321                 return 0;
322
323         if ((reg * 4) < adev->rmmio_size) {
324                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
325                     amdgpu_sriov_runtime(adev) &&
326                     down_read_trylock(&adev->reset_sem)) {
327                         ret = amdgpu_kiq_rreg(adev, reg);
328                         up_read(&adev->reset_sem);
329                 } else {
330                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
331                 }
332         } else {
333                 ret = adev->pcie_rreg(adev, reg * 4);
334         }
335
336         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
337
338         return ret;
339 }
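
/*
 * Usage sketch (illustrative): most code goes through the RREG32/WREG32
 * style macros in amdgpu.h rather than calling this helper directly. A
 * direct call that skips the KIQ path under SR-IOV would look like:
 *
 *   uint32_t val = amdgpu_device_rreg(adev, reg_offset, AMDGPU_REGS_NO_KIQ);
 *
 * where reg_offset is a dword aligned register offset.
 */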
340
341 /*
342  * MMIO register read with byte offset helper function
343  * @offset: byte offset from MMIO start
344  *
345  */
346
347 /**
348  * amdgpu_mm_rreg8 - read a memory mapped IO register
349  *
350  * @adev: amdgpu_device pointer
351  * @offset: byte aligned register offset
352  *
353  * Returns the 8 bit value from the offset specified.
354  */
355 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
356 {
357         if (adev->in_pci_err_recovery)
358                 return 0;
359
360         if (offset < adev->rmmio_size)
361                 return (readb(adev->rmmio + offset));
362         BUG();
363 }
364
365 /*
366  * MMIO register write with byte offset helper function
367  * @offset: byte offset from MMIO start
368  * @value: the value to be written to the register
369  *
370  */
371 /**
372  * amdgpu_mm_wreg8 - write a memory mapped IO register
373  *
374  * @adev: amdgpu_device pointer
375  * @offset: byte aligned register offset
376  * @value: 8 bit value to write
377  *
378  * Writes the value specified to the offset specified.
379  */
380 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
381 {
382         if (adev->in_pci_err_recovery)
383                 return;
384
385         if (offset < adev->rmmio_size)
386                 writeb(value, adev->rmmio + offset);
387         else
388                 BUG();
389 }
390
391 /**
392  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
393  *
394  * @adev: amdgpu_device pointer
395  * @reg: dword aligned register offset
396  * @v: 32 bit value to write to the register
397  * @acc_flags: access flags which require special behavior
398  *
399  * Writes the value specified to the offset specified.
400  */
401 void amdgpu_device_wreg(struct amdgpu_device *adev,
402                         uint32_t reg, uint32_t v,
403                         uint32_t acc_flags)
404 {
405         if (adev->in_pci_err_recovery)
406                 return;
407
408         if ((reg * 4) < adev->rmmio_size) {
409                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
410                     amdgpu_sriov_runtime(adev) &&
411                     down_read_trylock(&adev->reset_sem)) {
412                         amdgpu_kiq_wreg(adev, reg, v);
413                         up_read(&adev->reset_sem);
414                 } else {
415                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
416                 }
417         } else {
418                 adev->pcie_wreg(adev, reg * 4, v);
419         }
420
421         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
422 }
423
424 /*
425  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if in range
426  *
427  * This function is only invoked for debugfs register access.
428  */
429 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
430                              uint32_t reg, uint32_t v)
431 {
432         if (adev->in_pci_err_recovery)
433                 return;
434
435         if (amdgpu_sriov_fullaccess(adev) &&
436             adev->gfx.rlc.funcs &&
437             adev->gfx.rlc.funcs->is_rlcg_access_range) {
438                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
439                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
440         } else {
441                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
442         }
443 }
444
445 /**
446  * amdgpu_io_rreg - read an IO register
447  *
448  * @adev: amdgpu_device pointer
449  * @reg: dword aligned register offset
450  *
451  * Returns the 32 bit value from the offset specified.
452  */
453 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
454 {
455         if (adev->in_pci_err_recovery)
456                 return 0;
457
458         if ((reg * 4) < adev->rio_mem_size)
459                 return ioread32(adev->rio_mem + (reg * 4));
460         else {
461                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
462                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
463         }
464 }
465
466 /**
467  * amdgpu_io_wreg - write to an IO register
468  *
469  * @adev: amdgpu_device pointer
470  * @reg: dword aligned register offset
471  * @v: 32 bit value to write to the register
472  *
473  * Writes the value specified to the offset specified.
474  */
475 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
476 {
477         if (adev->in_pci_err_recovery)
478                 return;
479
480         if ((reg * 4) < adev->rio_mem_size)
481                 iowrite32(v, adev->rio_mem + (reg * 4));
482         else {
483                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
484                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
485         }
486 }
487
488 /**
489  * amdgpu_mm_rdoorbell - read a doorbell dword
490  *
491  * @adev: amdgpu_device pointer
492  * @index: doorbell index
493  *
494  * Returns the value in the doorbell aperture at the
495  * requested doorbell index (CIK).
496  */
497 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
498 {
499         if (adev->in_pci_err_recovery)
500                 return 0;
501
502         if (index < adev->doorbell.num_doorbells) {
503                 return readl(adev->doorbell.ptr + index);
504         } else {
505                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
506                 return 0;
507         }
508 }
509
510 /**
511  * amdgpu_mm_wdoorbell - write a doorbell dword
512  *
513  * @adev: amdgpu_device pointer
514  * @index: doorbell index
515  * @v: value to write
516  *
517  * Writes @v to the doorbell aperture at the
518  * requested doorbell index (CIK).
519  */
520 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
521 {
522         if (adev->in_pci_err_recovery)
523                 return;
524
525         if (index < adev->doorbell.num_doorbells) {
526                 writel(v, adev->doorbell.ptr + index);
527         } else {
528                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
529         }
530 }
531
532 /**
533  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
534  *
535  * @adev: amdgpu_device pointer
536  * @index: doorbell index
537  *
538  * Returns the value in the doorbell aperture at the
539  * requested doorbell index (VEGA10+).
540  */
541 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
542 {
543         if (adev->in_pci_err_recovery)
544                 return 0;
545
546         if (index < adev->doorbell.num_doorbells) {
547                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
548         } else {
549                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
550                 return 0;
551         }
552 }
553
554 /**
555  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
556  *
557  * @adev: amdgpu_device pointer
558  * @index: doorbell index
559  * @v: value to write
560  *
561  * Writes @v to the doorbell aperture at the
562  * requested doorbell index (VEGA10+).
563  */
564 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
565 {
566         if (adev->in_pci_err_recovery)
567                 return;
568
569         if (index < adev->doorbell.num_doorbells) {
570                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
571         } else {
572                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
573         }
574 }
575
576 /**
577  * amdgpu_device_indirect_rreg - read an indirect register
578  *
579  * @adev: amdgpu_device pointer
580  * @pcie_index: mmio register offset
581  * @pcie_data: mmio register offset
582  *
583  * Returns the value of indirect register @reg_addr
584  */
585 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
586                                 u32 pcie_index, u32 pcie_data,
587                                 u32 reg_addr)
588 {
589         unsigned long flags;
590         u32 r;
591         void __iomem *pcie_index_offset;
592         void __iomem *pcie_data_offset;
593
594         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
595         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
596         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
597
598         writel(reg_addr, pcie_index_offset);
599         readl(pcie_index_offset);
600         r = readl(pcie_data_offset);
601         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
602
603         return r;
604 }
605
606 /**
607  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
608  *
609  * @adev: amdgpu_device pointer
610  * @pcie_index: mmio register offset
611  * @pcie_data: mmio register offset
612  *
613  * Returns the value of indirect register @reg_addr
614  */
615 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
616                                   u32 pcie_index, u32 pcie_data,
617                                   u32 reg_addr)
618 {
619         unsigned long flags;
620         u64 r;
621         void __iomem *pcie_index_offset;
622         void __iomem *pcie_data_offset;
623
624         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
625         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
626         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
627
628         /* read low 32 bits */
629         writel(reg_addr, pcie_index_offset);
630         readl(pcie_index_offset);
631         r = readl(pcie_data_offset);
632         /* read high 32 bits */
633         writel(reg_addr + 4, pcie_index_offset);
634         readl(pcie_index_offset);
635         r |= ((u64)readl(pcie_data_offset) << 32);
636         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
637
638         return r;
639 }
640
641 /**
642  * amdgpu_device_indirect_wreg - write an indirect register
643  *
644  * @adev: amdgpu_device pointer
645  * @pcie_index: mmio register offset
646  * @pcie_data: mmio register offset
647  * @reg_addr: indirect register offset
648  * @reg_data: indirect register data
649  *
650  */
651 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
652                                  u32 pcie_index, u32 pcie_data,
653                                  u32 reg_addr, u32 reg_data)
654 {
655         unsigned long flags;
656         void __iomem *pcie_index_offset;
657         void __iomem *pcie_data_offset;
658
659         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
660         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
661         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
662
663         writel(reg_addr, pcie_index_offset);
664         readl(pcie_index_offset);
665         writel(reg_data, pcie_data_offset);
666         readl(pcie_data_offset);
667         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
668 }
669
670 /**
671  * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
672  *
673  * @adev: amdgpu_device pointer
674  * @pcie_index: mmio register offset
675  * @pcie_data: mmio register offset
676  * @reg_addr: indirect register offset
677  * @reg_data: indirect register data
678  *
679  */
680 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
681                                    u32 pcie_index, u32 pcie_data,
682                                    u32 reg_addr, u64 reg_data)
683 {
684         unsigned long flags;
685         void __iomem *pcie_index_offset;
686         void __iomem *pcie_data_offset;
687
688         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
689         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
690         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
691
692         /* write low 32 bits */
693         writel(reg_addr, pcie_index_offset);
694         readl(pcie_index_offset);
695         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
696         readl(pcie_data_offset);
697         /* write high 32 bits */
698         writel(reg_addr + 4, pcie_index_offset);
699         readl(pcie_index_offset);
700         writel((u32)(reg_data >> 32), pcie_data_offset);
701         readl(pcie_data_offset);
702         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
703 }
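
/*
 * Sketch (the register offsets below are made up for illustration): an ASIC
 * specific pcie_rreg/pcie_wreg callback can be built on top of these helpers
 * by fixing the index/data register pair used by that ASIC. The 0x38/0x3c
 * pair here is a hypothetical example, not a real register layout.
 *
 *   static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           return amdgpu_device_indirect_rreg(adev, 0x38, 0x3c, reg);
 *   }
 */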
704
705 /**
706  * amdgpu_invalid_rreg - dummy reg read function
707  *
708  * @adev: amdgpu device pointer
709  * @reg: offset of register
710  *
711  * Dummy register read function.  Used for register blocks
712  * that certain asics don't have (all asics).
713  * Returns the value in the register.
714  */
715 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
716 {
717         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
718         BUG();
719         return 0;
720 }
721
722 /**
723  * amdgpu_invalid_wreg - dummy reg write function
724  *
725  * @adev: amdgpu device pointer
726  * @reg: offset of register
727  * @v: value to write to the register
728  *
729  * Dummy register write function.  Used for register blocks
730  * that certain asics don't have (all asics).
731  */
732 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
733 {
734         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
735                   reg, v);
736         BUG();
737 }
738
739 /**
740  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
741  *
742  * @adev: amdgpu device pointer
743  * @reg: offset of register
744  *
745  * Dummy register read function.  Used for register blocks
746  * that certain asics don't have (all asics).
747  * Returns the value in the register.
748  */
749 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
750 {
751         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
752         BUG();
753         return 0;
754 }
755
756 /**
757  * amdgpu_invalid_wreg64 - dummy reg write function
758  *
759  * @adev: amdgpu device pointer
760  * @reg: offset of register
761  * @v: value to write to the register
762  *
763  * Dummy register write function.  Used for register blocks
764  * that certain asics don't have (all asics).
765  */
766 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
767 {
768         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
769                   reg, v);
770         BUG();
771 }
772
773 /**
774  * amdgpu_block_invalid_rreg - dummy reg read function
775  *
776  * @adev: amdgpu device pointer
777  * @block: offset of instance
778  * @reg: offset of register
779  *
780  * Dummy register read function.  Used for register blocks
781  * that certain asics don't have (all asics).
782  * Returns the value in the register.
783  */
784 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
785                                           uint32_t block, uint32_t reg)
786 {
787         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
788                   reg, block);
789         BUG();
790         return 0;
791 }
792
793 /**
794  * amdgpu_block_invalid_wreg - dummy reg write function
795  *
796  * @adev: amdgpu device pointer
797  * @block: offset of instance
798  * @reg: offset of register
799  * @v: value to write to the register
800  *
801  * Dummy register write function.  Used for register blocks
802  * that certain asics don't have (all asics).
803  */
804 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
805                                       uint32_t block,
806                                       uint32_t reg, uint32_t v)
807 {
808         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
809                   reg, block, v);
810         BUG();
811 }
812
813 /**
814  * amdgpu_device_asic_init - Wrapper for atom asic_init
815  *
816  * @adev: amdgpu_device pointer
817  *
818  * Does any asic specific work and then calls atom asic init.
819  */
820 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
821 {
822         amdgpu_asic_pre_asic_init(adev);
823
824         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
825 }
826
827 /**
828  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
829  *
830  * @adev: amdgpu device pointer
831  *
832  * Allocates a scratch page of VRAM for use by various things in the
833  * driver.
834  */
835 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
836 {
837         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
838                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
839                                        &adev->vram_scratch.robj,
840                                        &adev->vram_scratch.gpu_addr,
841                                        (void **)&adev->vram_scratch.ptr);
842 }
843
844 /**
845  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
846  *
847  * @adev: amdgpu device pointer
848  *
849  * Frees the VRAM scratch page.
850  */
851 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
852 {
853         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
854 }
855
856 /**
857  * amdgpu_device_program_register_sequence - program an array of registers.
858  *
859  * @adev: amdgpu_device pointer
860  * @registers: pointer to the register array
861  * @array_size: size of the register array
862  *
863  * Programs an array of registers with AND and OR masks.
864  * This is a helper for setting golden registers.
865  */
866 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
867                                              const u32 *registers,
868                                              const u32 array_size)
869 {
870         u32 tmp, reg, and_mask, or_mask;
871         int i;
872
873         if (array_size % 3)
874                 return;
875
876         for (i = 0; i < array_size; i += 3) {
877                 reg = registers[i + 0];
878                 and_mask = registers[i + 1];
879                 or_mask = registers[i + 2];
880
881                 if (and_mask == 0xffffffff) {
882                         tmp = or_mask;
883                 } else {
884                         tmp = RREG32(reg);
885                         tmp &= ~and_mask;
886                         if (adev->family >= AMDGPU_FAMILY_AI)
887                                 tmp |= (or_mask & and_mask);
888                         else
889                                 tmp |= or_mask;
890                 }
891                 WREG32(reg, tmp);
892         }
893 }
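
/*
 * Example (register offsets and masks are illustrative only): the array is a
 * flat list of {offset, and_mask, or_mask} triplets. An and_mask of
 * 0xffffffff writes or_mask verbatim; otherwise the and_mask bits are
 * cleared and the or_mask bits are set (restricted to and_mask on AI+).
 *
 *   static const u32 example_golden_regs[] = {
 *           0x1234, 0xffffffff, 0x00000001,
 *           0x5678, 0x0000000f, 0x00000003,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *                                           ARRAY_SIZE(example_golden_regs));
 */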
894
895 /**
896  * amdgpu_device_pci_config_reset - reset the GPU
897  *
898  * @adev: amdgpu_device pointer
899  *
900  * Resets the GPU using the pci config reset sequence.
901  * Only applicable to asics prior to vega10.
902  */
903 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
904 {
905         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
906 }
907
908 /*
909  * GPU doorbell aperture helpers function.
910  */
911 /**
912  * amdgpu_device_doorbell_init - Init doorbell driver information.
913  *
914  * @adev: amdgpu_device pointer
915  *
916  * Init doorbell driver information (CIK)
917  * Returns 0 on success, error on failure.
918  */
919 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
920 {
921
922         /* No doorbell on SI hardware generation */
923         if (adev->asic_type < CHIP_BONAIRE) {
924                 adev->doorbell.base = 0;
925                 adev->doorbell.size = 0;
926                 adev->doorbell.num_doorbells = 0;
927                 adev->doorbell.ptr = NULL;
928                 return 0;
929         }
930
931         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
932                 return -EINVAL;
933
934         amdgpu_asic_init_doorbell_index(adev);
935
936         /* doorbell bar mapping */
937         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
938         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
939
940         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
941                                              adev->doorbell_index.max_assignment+1);
942         if (adev->doorbell.num_doorbells == 0)
943                 return -EINVAL;
944
945         /* For Vega, reserve and map two pages on the doorbell BAR since the SDMA
946          * paging queue doorbell uses the second page. The
947          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
948          * doorbells are in the first page. So with the paging queue enabled,
949          * max num_doorbells should be increased by one page (0x400 in dwords).
950          */
951         if (adev->asic_type >= CHIP_VEGA10)
952                 adev->doorbell.num_doorbells += 0x400;
953
954         adev->doorbell.ptr = ioremap(adev->doorbell.base,
955                                      adev->doorbell.num_doorbells *
956                                      sizeof(u32));
957         if (adev->doorbell.ptr == NULL)
958                 return -ENOMEM;
959
960         return 0;
961 }
962
963 /**
964  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
965  *
966  * @adev: amdgpu_device pointer
967  *
968  * Tear down doorbell driver information (CIK)
969  */
970 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
971 {
972         iounmap(adev->doorbell.ptr);
973         adev->doorbell.ptr = NULL;
974 }
975
976
977
978 /*
979  * amdgpu_device_wb_*()
980  * Writeback is the method by which the GPU updates special pages in memory
981  * with the status of certain GPU events (fences, ring pointers, etc.).
982  */
983
984 /**
985  * amdgpu_device_wb_fini - Disable Writeback and free memory
986  *
987  * @adev: amdgpu_device pointer
988  *
989  * Disables Writeback and frees the Writeback memory (all asics).
990  * Used at driver shutdown.
991  */
992 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
993 {
994         if (adev->wb.wb_obj) {
995                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
996                                       &adev->wb.gpu_addr,
997                                       (void **)&adev->wb.wb);
998                 adev->wb.wb_obj = NULL;
999         }
1000 }
1001
1002 /**
1003  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1004  *
1005  * @adev: amdgpu_device pointer
1006  *
1007  * Initializes writeback and allocates writeback memory (all asics).
1008  * Used at driver startup.
1009  * Returns 0 on success or a negative error code on failure.
1010  */
1011 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1012 {
1013         int r;
1014
1015         if (adev->wb.wb_obj == NULL) {
1016                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1017                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1018                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1019                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1020                                             (void **)&adev->wb.wb);
1021                 if (r) {
1022                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1023                         return r;
1024                 }
1025
1026                 adev->wb.num_wb = AMDGPU_MAX_WB;
1027                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1028
1029                 /* clear wb memory */
1030                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1031         }
1032
1033         return 0;
1034 }
1035
1036 /**
1037  * amdgpu_device_wb_get - Allocate a wb entry
1038  *
1039  * @adev: amdgpu_device pointer
1040  * @wb: wb index
1041  *
1042  * Allocate a wb slot for use by the driver (all asics).
1043  * Returns 0 on success or -EINVAL on failure.
1044  */
1045 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1046 {
1047         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1048
1049         if (offset < adev->wb.num_wb) {
1050                 __set_bit(offset, adev->wb.used);
1051                 *wb = offset << 3; /* convert to dw offset */
1052                 return 0;
1053         } else {
1054                 return -EINVAL;
1055         }
1056 }
1057
1058 /**
1059  * amdgpu_device_wb_free - Free a wb entry
1060  *
1061  * @adev: amdgpu_device pointer
1062  * @wb: wb index
1063  *
1064  * Free a wb slot allocated for use by the driver (all asics)
1065  */
1066 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1067 {
1068         wb >>= 3;
1069         if (wb < adev->wb.num_wb)
1070                 __clear_bit(wb, adev->wb.used);
1071 }
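
/*
 * Usage sketch (illustrative): each slot is 256 bits wide and the returned
 * index is already converted to a dword offset, so a typical consumer looks
 * roughly like:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           ... hand gpu_addr to the GPU, poll *cpu_addr ...
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */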
1072
1073 /**
1074  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1075  *
1076  * @adev: amdgpu_device pointer
1077  *
1078  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1079  * to fail, but if any of the BARs is not accessible after the resize we abort
1080  * driver loading by returning -ENODEV.
1081  */
1082 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1083 {
1084         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1085         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1086         struct pci_bus *root;
1087         struct resource *res;
1088         unsigned i;
1089         u16 cmd;
1090         int r;
1091
1092         /* Bypass for VF */
1093         if (amdgpu_sriov_vf(adev))
1094                 return 0;
1095
1096         /* skip if the bios has already enabled large BAR */
1097         if (adev->gmc.real_vram_size &&
1098             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1099                 return 0;
1100
1101         /* Check if the root BUS has 64bit memory resources */
1102         root = adev->pdev->bus;
1103         while (root->parent)
1104                 root = root->parent;
1105
1106         pci_bus_for_each_resource(root, res, i) {
1107                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1108                     res->start > 0x100000000ull)
1109                         break;
1110         }
1111
1112         /* Trying to resize is pointless without a root hub window above 4GB */
1113         if (!res)
1114                 return 0;
1115
1116         /* Disable memory decoding while we change the BAR addresses and size */
1117         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1118         pci_write_config_word(adev->pdev, PCI_COMMAND,
1119                               cmd & ~PCI_COMMAND_MEMORY);
1120
1121         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1122         amdgpu_device_doorbell_fini(adev);
1123         if (adev->asic_type >= CHIP_BONAIRE)
1124                 pci_release_resource(adev->pdev, 2);
1125
1126         pci_release_resource(adev->pdev, 0);
1127
1128         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1129         if (r == -ENOSPC)
1130                 DRM_INFO("Not enough PCI address space for a large BAR.");
1131         else if (r && r != -ENOTSUPP)
1132                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1133
1134         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1135
1136         /* When the doorbell or fb BAR isn't available we have no chance of
1137          * using the device.
1138          */
1139         r = amdgpu_device_doorbell_init(adev);
1140         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1141                 return -ENODEV;
1142
1143         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1144
1145         return 0;
1146 }
1147
1148 /*
1149  * GPU helpers function.
1150  */
1151 /**
1152  * amdgpu_device_need_post - check if the hw needs to be posted or not
1153  *
1154  * @adev: amdgpu_device pointer
1155  *
1156  * Check if the asic has been initialized (all asics) at driver startup
1157  * or if a post is needed after a hw reset is performed.
1158  * Returns true if need or false if not.
1159  */
1160 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1161 {
1162         uint32_t reg;
1163
1164         if (amdgpu_sriov_vf(adev))
1165                 return false;
1166
1167         if (amdgpu_passthrough(adev)) {
1168                 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM reboot
1169                  * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU
1170                  * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force
1171                  * vPost to be executed for SMC versions below 22.15.
1172                  */
1173                 if (adev->asic_type == CHIP_FIJI) {
1174                         int err;
1175                         uint32_t fw_ver;
1176                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1177                         /* force vPost if an error occurred */
1178                         if (err)
1179                                 return true;
1180
1181                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1182                         if (fw_ver < 0x00160e00)
1183                                 return true;
1184                 }
1185         }
1186
1187         if (adev->has_hw_reset) {
1188                 adev->has_hw_reset = false;
1189                 return true;
1190         }
1191
1192         /* bios scratch used on CIK+ */
1193         if (adev->asic_type >= CHIP_BONAIRE)
1194                 return amdgpu_atombios_scratch_need_asic_init(adev);
1195
1196         /* check MEM_SIZE for older asics */
1197         reg = amdgpu_asic_get_config_memsize(adev);
1198
1199         if ((reg != 0) && (reg != 0xffffffff))
1200                 return false;
1201
1202         return true;
1203 }
1204
1205 /* if we get transitioned to only one device, take VGA back */
1206 /**
1207  * amdgpu_device_vga_set_decode - enable/disable vga decode
1208  *
1209  * @cookie: amdgpu_device pointer
1210  * @state: enable/disable vga decode
1211  *
1212  * Enable/disable vga decode (all asics).
1213  * Returns VGA resource flags.
1214  */
1215 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1216 {
1217         struct amdgpu_device *adev = cookie;
1218         amdgpu_asic_set_vga_state(adev, state);
1219         if (state)
1220                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1221                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1222         else
1223                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1224 }
1225
1226 /**
1227  * amdgpu_device_check_block_size - validate the vm block size
1228  *
1229  * @adev: amdgpu_device pointer
1230  *
1231  * Validates the vm block size specified via module parameter.
1232  * The vm block size defines number of bits in page table versus page directory,
1233  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1234  * page table and the remaining bits are in the page directory.
1235  */
1236 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1237 {
1238         /* defines number of bits in page table versus page directory,
1239          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1240          * page table and the remaining bits are in the page directory */
1241         if (amdgpu_vm_block_size == -1)
1242                 return;
1243
1244         if (amdgpu_vm_block_size < 9) {
1245                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1246                          amdgpu_vm_block_size);
1247                 amdgpu_vm_block_size = -1;
1248         }
1249 }
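
/*
 * Worked example: with 4KB pages there are 12 offset bits, so a block size
 * of 9 means one page table maps 2^(12 + 9) = 2MB of address space; larger
 * values move more of the translation out of the page directory and into
 * the page table.
 */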
1250
1251 /**
1252  * amdgpu_device_check_vm_size - validate the vm size
1253  *
1254  * @adev: amdgpu_device pointer
1255  *
1256  * Validates the vm size in GB specified via module parameter.
1257  * The VM size is the size of the GPU virtual memory space in GB.
1258  */
1259 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1260 {
1261         /* no need to check the default value */
1262         if (amdgpu_vm_size == -1)
1263                 return;
1264
1265         if (amdgpu_vm_size < 1) {
1266                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1267                          amdgpu_vm_size);
1268                 amdgpu_vm_size = -1;
1269         }
1270 }
1271
1272 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1273 {
1274         struct sysinfo si;
1275         bool is_os_64 = (sizeof(void *) == 8);
1276         uint64_t total_memory;
1277         uint64_t dram_size_seven_GB = 0x1B8000000;
1278         uint64_t dram_size_three_GB = 0xB8000000;
1279
1280         if (amdgpu_smu_memory_pool_size == 0)
1281                 return;
1282
1283         if (!is_os_64) {
1284                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1285                 goto def_value;
1286         }
1287         si_meminfo(&si);
1288         total_memory = (uint64_t)si.totalram * si.mem_unit;
1289
1290         if ((amdgpu_smu_memory_pool_size == 1) ||
1291                 (amdgpu_smu_memory_pool_size == 2)) {
1292                 if (total_memory < dram_size_three_GB)
1293                         goto def_value1;
1294         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1295                 (amdgpu_smu_memory_pool_size == 8)) {
1296                 if (total_memory < dram_size_seven_GB)
1297                         goto def_value1;
1298         } else {
1299                 DRM_WARN("Smu memory pool size not supported\n");
1300                 goto def_value;
1301         }
1302         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1303
1304         return;
1305
1306 def_value1:
1307         DRM_WARN("Not enough system memory\n");
1308 def_value:
1309         adev->pm.smu_prv_buffer_size = 0;
1310 }
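
/*
 * Note on the computation above: amdgpu_smu_memory_pool_size is given in
 * units of 256MB, so the shift by 28 converts it to bytes, e.g.
 *
 *   adev->pm.smu_prv_buffer_size = 2 << 28;      (0x20000000 = 512MB)
 *
 * Accepted values are 1, 2, 4 and 8 (256MB to 2GB), gated on the amount of
 * system memory checked above.
 */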
1311
1312 /**
1313  * amdgpu_device_check_arguments - validate module params
1314  *
1315  * @adev: amdgpu_device pointer
1316  *
1317  * Validates certain module parameters and updates
1318  * the associated values used by the driver (all asics).
1319  */
1320 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1321 {
1322         if (amdgpu_sched_jobs < 4) {
1323                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1324                          amdgpu_sched_jobs);
1325                 amdgpu_sched_jobs = 4;
1326         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1327                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1328                          amdgpu_sched_jobs);
1329                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1330         }
1331
1332         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1333                 /* gart size must be greater or equal to 32M */
1334                 dev_warn(adev->dev, "gart size (%d) too small\n",
1335                          amdgpu_gart_size);
1336                 amdgpu_gart_size = -1;
1337         }
1338
1339         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1340                 /* gtt size must be greater or equal to 32M */
1341                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1342                                  amdgpu_gtt_size);
1343                 amdgpu_gtt_size = -1;
1344         }
1345
1346         /* valid range is between 4 and 9 inclusive */
1347         if (amdgpu_vm_fragment_size != -1 &&
1348             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1349                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1350                 amdgpu_vm_fragment_size = -1;
1351         }
1352
1353         if (amdgpu_sched_hw_submission < 2) {
1354                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1355                          amdgpu_sched_hw_submission);
1356                 amdgpu_sched_hw_submission = 2;
1357         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1358                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1359                          amdgpu_sched_hw_submission);
1360                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1361         }
1362
1363         amdgpu_device_check_smu_prv_buffer_size(adev);
1364
1365         amdgpu_device_check_vm_size(adev);
1366
1367         amdgpu_device_check_block_size(adev);
1368
1369         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1370
1371         amdgpu_gmc_tmz_set(adev);
1372
1373         if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1374                 amdgpu_num_kcq = 8;
1375                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1376         }
1377
1378         amdgpu_gmc_noretry_set(adev);
1379
1380         return 0;
1381 }
1382
1383 /**
1384  * amdgpu_switcheroo_set_state - set switcheroo state
1385  *
1386  * @pdev: pci dev pointer
1387  * @state: vga_switcheroo state
1388  *
1389  * Callback for the switcheroo driver.  Suspends or resumes the
1390  * asic before or after it is powered up using ACPI methods.
1391  */
1392 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1393                                         enum vga_switcheroo_state state)
1394 {
1395         struct drm_device *dev = pci_get_drvdata(pdev);
1396         int r;
1397
1398         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1399                 return;
1400
1401         if (state == VGA_SWITCHEROO_ON) {
1402                 pr_info("switched on\n");
1403                 /* don't suspend or resume card normally */
1404                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1405
1406                 pci_set_power_state(dev->pdev, PCI_D0);
1407                 amdgpu_device_load_pci_state(dev->pdev);
1408                 r = pci_enable_device(dev->pdev);
1409                 if (r)
1410                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1411                 amdgpu_device_resume(dev, true);
1412
1413                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1414                 drm_kms_helper_poll_enable(dev);
1415         } else {
1416                 pr_info("switched off\n");
1417                 drm_kms_helper_poll_disable(dev);
1418                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1419                 amdgpu_device_suspend(dev, true);
1420                 amdgpu_device_cache_pci_state(dev->pdev);
1421                 /* Shut down the device */
1422                 pci_disable_device(dev->pdev);
1423                 pci_set_power_state(dev->pdev, PCI_D3cold);
1424                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1425         }
1426 }
1427
1428 /**
1429  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1430  *
1431  * @pdev: pci dev pointer
1432  *
1433  * Callback for the switcheroo driver.  Checks if the switcheroo
1434  * state can be changed.
1435  * Returns true if the state can be changed, false if not.
1436  */
1437 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1438 {
1439         struct drm_device *dev = pci_get_drvdata(pdev);
1440
1441         /*
1442         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1443         * locking inversion with the driver load path. And the access here is
1444         * completely racy anyway. So don't bother with locking for now.
1445         */
1446         return atomic_read(&dev->open_count) == 0;
1447 }
1448
1449 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1450         .set_gpu_state = amdgpu_switcheroo_set_state,
1451         .reprobe = NULL,
1452         .can_switch = amdgpu_switcheroo_can_switch,
1453 };
1454
1455 /**
1456  * amdgpu_device_ip_set_clockgating_state - set the CG state
1457  *
1458  * @dev: amdgpu_device pointer
1459  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1460  * @state: clockgating state (gate or ungate)
1461  *
1462  * Sets the requested clockgating state for all instances of
1463  * the hardware IP specified.
1464  * Returns the error code from the last instance.
1465  */
1466 int amdgpu_device_ip_set_clockgating_state(void *dev,
1467                                            enum amd_ip_block_type block_type,
1468                                            enum amd_clockgating_state state)
1469 {
1470         struct amdgpu_device *adev = dev;
1471         int i, r = 0;
1472
1473         for (i = 0; i < adev->num_ip_blocks; i++) {
1474                 if (!adev->ip_blocks[i].status.valid)
1475                         continue;
1476                 if (adev->ip_blocks[i].version->type != block_type)
1477                         continue;
1478                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1479                         continue;
1480                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1481                         (void *)adev, state);
1482                 if (r)
1483                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1484                                   adev->ip_blocks[i].version->funcs->name, r);
1485         }
1486         return r;
1487 }
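
/*
 * Usage sketch (illustrative; enum values come from amd_shared.h): gate the
 * clocks of every GFX IP instance on the device:
 *
 *   r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                              AMD_CG_STATE_GATE);
 *
 * The powergating variant below works the same way with AMD_PG_STATE_*.
 */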
1488
1489 /**
1490  * amdgpu_device_ip_set_powergating_state - set the PG state
1491  *
1492  * @dev: amdgpu_device pointer
1493  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1494  * @state: powergating state (gate or ungate)
1495  *
1496  * Sets the requested powergating state for all instances of
1497  * the hardware IP specified.
1498  * Returns the error code from the last instance.
1499  */
1500 int amdgpu_device_ip_set_powergating_state(void *dev,
1501                                            enum amd_ip_block_type block_type,
1502                                            enum amd_powergating_state state)
1503 {
1504         struct amdgpu_device *adev = dev;
1505         int i, r = 0;
1506
1507         for (i = 0; i < adev->num_ip_blocks; i++) {
1508                 if (!adev->ip_blocks[i].status.valid)
1509                         continue;
1510                 if (adev->ip_blocks[i].version->type != block_type)
1511                         continue;
1512                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1513                         continue;
1514                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1515                         (void *)adev, state);
1516                 if (r)
1517                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1518                                   adev->ip_blocks[i].version->funcs->name, r);
1519         }
1520         return r;
1521 }
1522
1523 /**
1524  * amdgpu_device_ip_get_clockgating_state - get the CG state
1525  *
1526  * @adev: amdgpu_device pointer
1527  * @flags: clockgating feature flags
1528  *
1529  * Walks the list of IPs on the device and updates the clockgating
1530  * flags for each IP.
1531  * Updates @flags with the feature flags for each hardware IP where
1532  * clockgating is enabled.
1533  */
1534 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1535                                             u32 *flags)
1536 {
1537         int i;
1538
1539         for (i = 0; i < adev->num_ip_blocks; i++) {
1540                 if (!adev->ip_blocks[i].status.valid)
1541                         continue;
1542                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1543                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1544         }
1545 }
1546
1547 /**
1548  * amdgpu_device_ip_wait_for_idle - wait for idle
1549  *
1550  * @adev: amdgpu_device pointer
1551  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1552  *
1553  * Waits for the requested hardware IP to be idle.
1554  * Returns 0 for success or a negative error code on failure.
1555  */
1556 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1557                                    enum amd_ip_block_type block_type)
1558 {
1559         int i, r;
1560
1561         for (i = 0; i < adev->num_ip_blocks; i++) {
1562                 if (!adev->ip_blocks[i].status.valid)
1563                         continue;
1564                 if (adev->ip_blocks[i].version->type == block_type) {
1565                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1566                         if (r)
1567                                 return r;
1568                         break;
1569                 }
1570         }
1571         return 0;
1572
1573 }
1574
1575 /**
1576  * amdgpu_device_ip_is_idle - is the hardware IP idle
1577  *
1578  * @adev: amdgpu_device pointer
1579  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1580  *
1581  * Check if the hardware IP is idle or not.
1582  * Returns true if the IP is idle, false if not.
1583  */
1584 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1585                               enum amd_ip_block_type block_type)
1586 {
1587         int i;
1588
1589         for (i = 0; i < adev->num_ip_blocks; i++) {
1590                 if (!adev->ip_blocks[i].status.valid)
1591                         continue;
1592                 if (adev->ip_blocks[i].version->type == block_type)
1593                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1594         }
1595         return true;
1596
1597 }
1598
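/*
 * Illustrative sketch (adev assumed valid): polling whether GFX is currently
 * idle; unlike the wait variant above this returns immediately:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX))
 *		DRM_DEBUG("GFX still busy\n");
 */
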
1599 /**
1600  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1601  *
1602  * @adev: amdgpu_device pointer
1603  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1604  *
1605  * Returns a pointer to the hardware IP block structure
1606  * if it exists for the asic, otherwise NULL.
1607  */
1608 struct amdgpu_ip_block *
1609 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1610                               enum amd_ip_block_type type)
1611 {
1612         int i;
1613
1614         for (i = 0; i < adev->num_ip_blocks; i++)
1615                 if (adev->ip_blocks[i].version->type == type)
1616                         return &adev->ip_blocks[i];
1617
1618         return NULL;
1619 }
1620
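/*
 * Illustrative sketch (adev assumed valid; a NULL check is required since the
 * block may not exist on this asic): looking up the GFX IP block to inspect
 * its version:
 *
 *	struct amdgpu_ip_block *ip_block =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (ip_block)
 *		DRM_DEBUG("GFX IP v%d.%d\n", ip_block->version->major,
 *			  ip_block->version->minor);
 */
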
1621 /**
1622  * amdgpu_device_ip_block_version_cmp - check an IP block's version
1623  *
1624  * @adev: amdgpu_device pointer
1625  * @type: enum amd_ip_block_type
1626  * @major: major version
1627  * @minor: minor version
1628  *
1629  * Returns 0 if the IP block's version is equal to or greater than the
1630  * requested version, or 1 if it is smaller or the IP block doesn't exist.
1631  */
1632 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1633                                        enum amd_ip_block_type type,
1634                                        u32 major, u32 minor)
1635 {
1636         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1637
1638         if (ip_block && ((ip_block->version->major > major) ||
1639                         ((ip_block->version->major == major) &&
1640                         (ip_block->version->minor >= minor))))
1641                 return 0;
1642
1643         return 1;
1644 }
1645
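/*
 * Illustrative sketch (adev assumed valid): gating a feature on the GFX block
 * being at least version 9.0, using the 0-means-satisfied convention above:
 *
 *	bool gfx9_or_newer =
 *		!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						    9, 0);
 */
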
1646 /**
1647  * amdgpu_device_ip_block_add - add an IP block to the device
1648  *
1649  * @adev: amdgpu_device pointer
1650  * @ip_block_version: pointer to the IP to add
1651  *
1652  * Adds the IP block driver information to the collection of IPs
1653  * on the asic.
1654  */
1655 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1656                                const struct amdgpu_ip_block_version *ip_block_version)
1657 {
1658         if (!ip_block_version)
1659                 return -EINVAL;
1660
1661         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1662                   ip_block_version->funcs->name);
1663
1664         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1665
1666         return 0;
1667 }
1668
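/*
 * Illustrative sketch: ASIC setup code (e.g. nv_set_ip_blocks(), called from
 * early init below) registers its blocks with this helper.  The block name
 * here is a placeholder, not taken from this file:
 *
 *	r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
 *	if (r)
 *		return r;
 */
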
1669 /**
1670  * amdgpu_device_enable_virtual_display - enable virtual display feature
1671  *
1672  * @adev: amdgpu_device pointer
1673  *
1674  * Enables the virtual display feature if the user has enabled it via
1675  * the module parameter virtual_display.  This feature provides virtual
1676  * display hardware on headless boards or in virtualized environments.
1677  * This function parses and validates the configuration string specified by
1678  * the user and configures the virtual display configuration (number of
1679  * virtual connectors, crtcs, etc.) specified.
1680  */
1681 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1682 {
1683         adev->enable_virtual_display = false;
1684
1685         if (amdgpu_virtual_display) {
1686                 struct drm_device *ddev = adev_to_drm(adev);
1687                 const char *pci_address_name = pci_name(ddev->pdev);
1688                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1689
1690                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1691                 pciaddstr_tmp = pciaddstr;
1692                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1693                         pciaddname = strsep(&pciaddname_tmp, ",");
1694                         if (!strcmp("all", pciaddname)
1695                             || !strcmp(pci_address_name, pciaddname)) {
1696                                 long num_crtc;
1697                                 int res = -1;
1698
1699                                 adev->enable_virtual_display = true;
1700
1701                                 if (pciaddname_tmp)
1702                                         res = kstrtol(pciaddname_tmp, 10,
1703                                                       &num_crtc);
1704
1705                                 if (!res) {
1706                                         if (num_crtc < 1)
1707                                                 num_crtc = 1;
1708                                         if (num_crtc > 6)
1709                                                 num_crtc = 6;
1710                                         adev->mode_info.num_crtc = num_crtc;
1711                                 } else {
1712                                         adev->mode_info.num_crtc = 1;
1713                                 }
1714                                 break;
1715                         }
1716                 }
1717
1718                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1719                          amdgpu_virtual_display, pci_address_name,
1720                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1721
1722                 kfree(pciaddstr);
1723         }
1724 }
1725
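/*
 * Illustrative examples of the virtual_display module parameter format that
 * the parser above accepts (the PCI addresses are placeholders):
 *
 *	amdgpu.virtual_display=0000:26:00.0,2			one device, 2 crtcs
 *	amdgpu.virtual_display=0000:26:00.0;0000:43:00.0	two devices
 *	amdgpu.virtual_display=all				every device
 *
 * The optional crtc count after the comma is clamped to the 1..6 range;
 * without it a single crtc is configured.
 */
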
1726 /**
1727  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1728  *
1729  * @adev: amdgpu_device pointer
1730  *
1731  * Parses the asic configuration parameters specified in the gpu info
1732  * firmware and makes them available to the driver for use in configuring
1733  * the asic.
1734  * Returns 0 on success, -EINVAL on failure.
1735  */
1736 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1737 {
1738         const char *chip_name;
1739         char fw_name[40];
1740         int err;
1741         const struct gpu_info_firmware_header_v1_0 *hdr;
1742
1743         adev->firmware.gpu_info_fw = NULL;
1744
1745         if (adev->mman.discovery_bin) {
1746                 amdgpu_discovery_get_gfx_info(adev);
1747
1748                 /*
1749                  * FIXME: The bounding box is still needed by Navi12, so
1750                  * temporarily read it from gpu_info firmware. Should be dropped
1751                  * when DAL no longer needs it.
1752                  */
1753                 if (adev->asic_type != CHIP_NAVI12)
1754                         return 0;
1755         }
1756
1757         switch (adev->asic_type) {
1758 #ifdef CONFIG_DRM_AMDGPU_SI
1759         case CHIP_VERDE:
1760         case CHIP_TAHITI:
1761         case CHIP_PITCAIRN:
1762         case CHIP_OLAND:
1763         case CHIP_HAINAN:
1764 #endif
1765 #ifdef CONFIG_DRM_AMDGPU_CIK
1766         case CHIP_BONAIRE:
1767         case CHIP_HAWAII:
1768         case CHIP_KAVERI:
1769         case CHIP_KABINI:
1770         case CHIP_MULLINS:
1771 #endif
1772         case CHIP_TOPAZ:
1773         case CHIP_TONGA:
1774         case CHIP_FIJI:
1775         case CHIP_POLARIS10:
1776         case CHIP_POLARIS11:
1777         case CHIP_POLARIS12:
1778         case CHIP_VEGAM:
1779         case CHIP_CARRIZO:
1780         case CHIP_STONEY:
1781         case CHIP_VEGA20:
1782         case CHIP_SIENNA_CICHLID:
1783         case CHIP_NAVY_FLOUNDER:
1784         default:
1785                 return 0;
1786         case CHIP_VEGA10:
1787                 chip_name = "vega10";
1788                 break;
1789         case CHIP_VEGA12:
1790                 chip_name = "vega12";
1791                 break;
1792         case CHIP_RAVEN:
1793                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1794                         chip_name = "raven2";
1795                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1796                         chip_name = "picasso";
1797                 else
1798                         chip_name = "raven";
1799                 break;
1800         case CHIP_ARCTURUS:
1801                 chip_name = "arcturus";
1802                 break;
1803         case CHIP_RENOIR:
1804                 chip_name = "renoir";
1805                 break;
1806         case CHIP_NAVI10:
1807                 chip_name = "navi10";
1808                 break;
1809         case CHIP_NAVI14:
1810                 chip_name = "navi14";
1811                 break;
1812         case CHIP_NAVI12:
1813                 chip_name = "navi12";
1814                 break;
1815         }
1816
1817         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1818         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1819         if (err) {
1820                 dev_err(adev->dev,
1821                         "Failed to load gpu_info firmware \"%s\"\n",
1822                         fw_name);
1823                 goto out;
1824         }
1825         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1826         if (err) {
1827                 dev_err(adev->dev,
1828                         "Failed to validate gpu_info firmware \"%s\"\n",
1829                         fw_name);
1830                 goto out;
1831         }
1832
1833         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1834         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1835
1836         switch (hdr->version_major) {
1837         case 1:
1838         {
1839                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1840                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1841                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1842
1843                 /*
1844                  * Should be dropped when DAL no longer needs it.
1845                  */
1846                 if (adev->asic_type == CHIP_NAVI12)
1847                         goto parse_soc_bounding_box;
1848
1849                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1850                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1851                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1852                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1853                 adev->gfx.config.max_texture_channel_caches =
1854                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1855                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1856                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1857                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1858                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1859                 adev->gfx.config.double_offchip_lds_buf =
1860                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1861                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1862                 adev->gfx.cu_info.max_waves_per_simd =
1863                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1864                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1865                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1866                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1867                 if (hdr->version_minor >= 1) {
1868                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1869                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1870                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1871                         adev->gfx.config.num_sc_per_sh =
1872                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1873                         adev->gfx.config.num_packer_per_sc =
1874                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1875                 }
1876
1877 parse_soc_bounding_box:
1878                 /*
1879                  * soc bounding box info is not integrated in the discovery table,
1880                  * so it still needs to be parsed from the gpu info firmware when needed.
1881                  */
1882                 if (hdr->version_minor == 2) {
1883                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1884                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1885                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1886                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1887                 }
1888                 break;
1889         }
1890         default:
1891                 dev_err(adev->dev,
1892                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1893                 err = -EINVAL;
1894                 goto out;
1895         }
1896 out:
1897         return err;
1898 }
1899
1900 /**
1901  * amdgpu_device_ip_early_init - run early init for hardware IPs
1902  *
1903  * @adev: amdgpu_device pointer
1904  *
1905  * Early initialization pass for hardware IPs.  The hardware IPs that make
1906  * up each asic are discovered and each IP's early_init callback is run.  This
1907  * is the first stage in initializing the asic.
1908  * Returns 0 on success, negative error code on failure.
1909  */
1910 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1911 {
1912         int i, r;
1913
1914         amdgpu_device_enable_virtual_display(adev);
1915
1916         if (amdgpu_sriov_vf(adev)) {
1917                 r = amdgpu_virt_request_full_gpu(adev, true);
1918                 if (r)
1919                         return r;
1920         }
1921
1922         switch (adev->asic_type) {
1923 #ifdef CONFIG_DRM_AMDGPU_SI
1924         case CHIP_VERDE:
1925         case CHIP_TAHITI:
1926         case CHIP_PITCAIRN:
1927         case CHIP_OLAND:
1928         case CHIP_HAINAN:
1929                 adev->family = AMDGPU_FAMILY_SI;
1930                 r = si_set_ip_blocks(adev);
1931                 if (r)
1932                         return r;
1933                 break;
1934 #endif
1935 #ifdef CONFIG_DRM_AMDGPU_CIK
1936         case CHIP_BONAIRE:
1937         case CHIP_HAWAII:
1938         case CHIP_KAVERI:
1939         case CHIP_KABINI:
1940         case CHIP_MULLINS:
1941                 if (adev->flags & AMD_IS_APU)
1942                         adev->family = AMDGPU_FAMILY_KV;
1943                 else
1944                         adev->family = AMDGPU_FAMILY_CI;
1945
1946                 r = cik_set_ip_blocks(adev);
1947                 if (r)
1948                         return r;
1949                 break;
1950 #endif
1951         case CHIP_TOPAZ:
1952         case CHIP_TONGA:
1953         case CHIP_FIJI:
1954         case CHIP_POLARIS10:
1955         case CHIP_POLARIS11:
1956         case CHIP_POLARIS12:
1957         case CHIP_VEGAM:
1958         case CHIP_CARRIZO:
1959         case CHIP_STONEY:
1960                 if (adev->flags & AMD_IS_APU)
1961                         adev->family = AMDGPU_FAMILY_CZ;
1962                 else
1963                         adev->family = AMDGPU_FAMILY_VI;
1964
1965                 r = vi_set_ip_blocks(adev);
1966                 if (r)
1967                         return r;
1968                 break;
1969         case CHIP_VEGA10:
1970         case CHIP_VEGA12:
1971         case CHIP_VEGA20:
1972         case CHIP_RAVEN:
1973         case CHIP_ARCTURUS:
1974         case CHIP_RENOIR:
1975                 if (adev->flags & AMD_IS_APU)
1976                         adev->family = AMDGPU_FAMILY_RV;
1977                 else
1978                         adev->family = AMDGPU_FAMILY_AI;
1979
1980                 r = soc15_set_ip_blocks(adev);
1981                 if (r)
1982                         return r;
1983                 break;
1984         case  CHIP_NAVI10:
1985         case  CHIP_NAVI14:
1986         case  CHIP_NAVI12:
1987         case  CHIP_SIENNA_CICHLID:
1988         case  CHIP_NAVY_FLOUNDER:
1989                 adev->family = AMDGPU_FAMILY_NV;
1990
1991                 r = nv_set_ip_blocks(adev);
1992                 if (r)
1993                         return r;
1994                 break;
1995         default:
1996                 /* FIXME: not supported yet */
1997                 return -EINVAL;
1998         }
1999
2000         amdgpu_amdkfd_device_probe(adev);
2001
2002         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2003         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2004                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2005
2006         for (i = 0; i < adev->num_ip_blocks; i++) {
2007                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2008                         DRM_ERROR("disabled ip block: %d <%s>\n",
2009                                   i, adev->ip_blocks[i].version->funcs->name);
2010                         adev->ip_blocks[i].status.valid = false;
2011                 } else {
2012                         if (adev->ip_blocks[i].version->funcs->early_init) {
2013                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2014                                 if (r == -ENOENT) {
2015                                         adev->ip_blocks[i].status.valid = false;
2016                                 } else if (r) {
2017                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2018                                                   adev->ip_blocks[i].version->funcs->name, r);
2019                                         return r;
2020                                 } else {
2021                                         adev->ip_blocks[i].status.valid = true;
2022                                 }
2023                         } else {
2024                                 adev->ip_blocks[i].status.valid = true;
2025                         }
2026                 }
2027                 /* get the vbios after the asic_funcs are set up */
2028                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2029                         r = amdgpu_device_parse_gpu_info_fw(adev);
2030                         if (r)
2031                                 return r;
2032
2033                         /* Read BIOS */
2034                         if (!amdgpu_get_bios(adev))
2035                                 return -EINVAL;
2036
2037                         r = amdgpu_atombios_init(adev);
2038                         if (r) {
2039                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2040                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2041                                 return r;
2042                         }
2043                 }
2044         }
2045
2046         adev->cg_flags &= amdgpu_cg_mask;
2047         adev->pg_flags &= amdgpu_pg_mask;
2048
2049         return 0;
2050 }
2051
2052 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2053 {
2054         int i, r;
2055
2056         for (i = 0; i < adev->num_ip_blocks; i++) {
2057                 if (!adev->ip_blocks[i].status.sw)
2058                         continue;
2059                 if (adev->ip_blocks[i].status.hw)
2060                         continue;
2061                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2062                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2063                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2064                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2065                         if (r) {
2066                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2067                                           adev->ip_blocks[i].version->funcs->name, r);
2068                                 return r;
2069                         }
2070                         adev->ip_blocks[i].status.hw = true;
2071                 }
2072         }
2073
2074         return 0;
2075 }
2076
2077 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2078 {
2079         int i, r;
2080
2081         for (i = 0; i < adev->num_ip_blocks; i++) {
2082                 if (!adev->ip_blocks[i].status.sw)
2083                         continue;
2084                 if (adev->ip_blocks[i].status.hw)
2085                         continue;
2086                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2087                 if (r) {
2088                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2089                                   adev->ip_blocks[i].version->funcs->name, r);
2090                         return r;
2091                 }
2092                 adev->ip_blocks[i].status.hw = true;
2093         }
2094
2095         return 0;
2096 }
2097
2098 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2099 {
2100         int r = 0;
2101         int i;
2102         uint32_t smu_version;
2103
2104         if (adev->asic_type >= CHIP_VEGA10) {
2105                 for (i = 0; i < adev->num_ip_blocks; i++) {
2106                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2107                                 continue;
2108
2109                         /* no need to do the fw loading again if already done */
2110                         if (adev->ip_blocks[i].status.hw == true)
2111                                 break;
2112
2113                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2114                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2115                                 if (r) {
2116                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2117                                                           adev->ip_blocks[i].version->funcs->name, r);
2118                                         return r;
2119                                 }
2120                         } else {
2121                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2122                                 if (r) {
2123                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2124                                                           adev->ip_blocks[i].version->funcs->name, r);
2125                                         return r;
2126                                 }
2127                         }
2128
2129                         adev->ip_blocks[i].status.hw = true;
2130                         break;
2131                 }
2132         }
2133
2134         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2135                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2136
2137         return r;
2138 }
2139
2140 /**
2141  * amdgpu_device_ip_init - run init for hardware IPs
2142  *
2143  * @adev: amdgpu_device pointer
2144  *
2145  * Main initialization pass for hardware IPs.  The list of all the hardware
2146  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2147  * are run.  sw_init initializes the software state associated with each IP
2148  * and hw_init initializes the hardware associated with each IP.
2149  * Returns 0 on success, negative error code on failure.
2150  */
2151 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2152 {
2153         int i, r;
2154
2155         r = amdgpu_ras_init(adev);
2156         if (r)
2157                 return r;
2158
2159         for (i = 0; i < adev->num_ip_blocks; i++) {
2160                 if (!adev->ip_blocks[i].status.valid)
2161                         continue;
2162                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2163                 if (r) {
2164                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2165                                   adev->ip_blocks[i].version->funcs->name, r);
2166                         goto init_failed;
2167                 }
2168                 adev->ip_blocks[i].status.sw = true;
2169
2170                 /* need to do gmc hw init early so we can allocate gpu mem */
2171                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2172                         r = amdgpu_device_vram_scratch_init(adev);
2173                         if (r) {
2174                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2175                                 goto init_failed;
2176                         }
2177                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2178                         if (r) {
2179                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2180                                 goto init_failed;
2181                         }
2182                         r = amdgpu_device_wb_init(adev);
2183                         if (r) {
2184                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2185                                 goto init_failed;
2186                         }
2187                         adev->ip_blocks[i].status.hw = true;
2188
2189                         /* right after GMC hw init, we create CSA */
2190                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2191                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2192                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2193                                                                 AMDGPU_CSA_SIZE);
2194                                 if (r) {
2195                                         DRM_ERROR("allocate CSA failed %d\n", r);
2196                                         goto init_failed;
2197                                 }
2198                         }
2199                 }
2200         }
2201
2202         if (amdgpu_sriov_vf(adev))
2203                 amdgpu_virt_init_data_exchange(adev);
2204
2205         r = amdgpu_ib_pool_init(adev);
2206         if (r) {
2207                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2208                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2209                 goto init_failed;
2210         }
2211
2212         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2213         if (r)
2214                 goto init_failed;
2215
2216         r = amdgpu_device_ip_hw_init_phase1(adev);
2217         if (r)
2218                 goto init_failed;
2219
2220         r = amdgpu_device_fw_loading(adev);
2221         if (r)
2222                 goto init_failed;
2223
2224         r = amdgpu_device_ip_hw_init_phase2(adev);
2225         if (r)
2226                 goto init_failed;
2227
2228         /*
2229          * Retired pages will be loaded from eeprom and reserved here;
2230          * it should be called after amdgpu_device_ip_hw_init_phase2 since
2231          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2232          * functional for I2C communication, which is only true at this point.
2233          *
2234          * amdgpu_ras_recovery_init may fail, but the upper level only cares
2235          * about failures caused by a bad gpu situation and stops the amdgpu
2236          * init process accordingly. For other failures, it still releases all
2237          * the resources and prints an error message, rather than returning a
2238          * negative value to the upper level.
2239          *
2240          * Note: theoretically, this should be called before all vram allocations
2241          * to protect retired pages from being reused.
2242          */
2243         r = amdgpu_ras_recovery_init(adev);
2244         if (r)
2245                 goto init_failed;
2246
2247         if (adev->gmc.xgmi.num_physical_nodes > 1)
2248                 amdgpu_xgmi_add_device(adev);
2249         amdgpu_amdkfd_device_init(adev);
2250
2251         amdgpu_fru_get_product_info(adev);
2252
2253 init_failed:
2254         if (amdgpu_sriov_vf(adev))
2255                 amdgpu_virt_release_full_gpu(adev, true);
2256
2257         return r;
2258 }
2259
2260 /**
2261  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2262  *
2263  * @adev: amdgpu_device pointer
2264  *
2265  * Saves a copy of the contents at the gart pointer in VRAM (the reset magic).
2266  * The driver calls this function before a GPU reset.  If the value is retained
2267  * after the reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2268  */
2269 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2270 {
2271         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2272 }
2273
2274 /**
2275  * amdgpu_device_check_vram_lost - check if vram is valid
2276  *
2277  * @adev: amdgpu_device pointer
2278  *
2279  * Checks the reset magic value written to the gart pointer in VRAM.
2280  * The driver calls this after a GPU reset to see if the contents of
2281  * VRAM have been lost or not.
2282  * Returns true if vram is lost, false if not.
2283  */
2284 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2285 {
2286         if (memcmp(adev->gart.ptr, adev->reset_magic,
2287                         AMDGPU_RESET_MAGIC_NUM))
2288                 return true;
2289
2290         if (!amdgpu_in_reset(adev))
2291                 return false;
2292
2293         /*
2294          * For all ASICs with baco/mode1 reset, the VRAM is
2295          * always assumed to be lost.
2296          */
2297         switch (amdgpu_asic_reset_method(adev)) {
2298         case AMD_RESET_METHOD_BACO:
2299         case AMD_RESET_METHOD_MODE1:
2300                 return true;
2301         default:
2302                 return false;
2303         }
2304 }
2305
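/*
 * Illustrative sketch of how a reset path might consume this (simplified;
 * the real handling lives in the GPU recovery code, field names assumed):
 *
 *	if (amdgpu_device_check_vram_lost(adev)) {
 *		DRM_INFO("VRAM is lost due to GPU reset!\n");
 *		atomic_inc(&adev->vram_lost_counter);
 *	}
 */
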
2306 /**
2307  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2308  *
2309  * @adev: amdgpu_device pointer
2310  * @state: clockgating state (gate or ungate)
2311  *
2312  * The list of all the hardware IPs that make up the asic is walked and the
2313  * set_clockgating_state callbacks are run.
2314  * The late initialization pass uses this to enable clockgating; the fini
2315  * and suspend paths use it to disable clockgating.
2316  * Returns 0 on success, negative error code on failure.
2317  */
2318
2319 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2320                                                 enum amd_clockgating_state state)
2321 {
2322         int i, j, r;
2323
2324         if (amdgpu_emu_mode == 1)
2325                 return 0;
2326
2327         for (j = 0; j < adev->num_ip_blocks; j++) {
2328                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2329                 if (!adev->ip_blocks[i].status.late_initialized)
2330                         continue;
2331                 /* skip CG for VCE/UVD, it's handled specially */
2332                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2333                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2334                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2335                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2336                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2337                         /* enable clockgating to save power */
2338                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2339                                                                                      state);
2340                         if (r) {
2341                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2342                                           adev->ip_blocks[i].version->funcs->name, r);
2343                                 return r;
2344                         }
2345                 }
2346         }
2347
2348         return 0;
2349 }
2350
2351 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2352 {
2353         int i, j, r;
2354
2355         if (amdgpu_emu_mode == 1)
2356                 return 0;
2357
2358         for (j = 0; j < adev->num_ip_blocks; j++) {
2359                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2360                 if (!adev->ip_blocks[i].status.late_initialized)
2361                         continue;
2362                 /* skip PG for VCE/UVD, it's handled specially */
2363                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2364                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2365                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2366                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2367                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2368                         /* enable powergating to save power */
2369                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2370                                                                                         state);
2371                         if (r) {
2372                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2373                                           adev->ip_blocks[i].version->funcs->name, r);
2374                                 return r;
2375                         }
2376                 }
2377         }
2378         return 0;
2379 }
2380
2381 static int amdgpu_device_enable_mgpu_fan_boost(void)
2382 {
2383         struct amdgpu_gpu_instance *gpu_ins;
2384         struct amdgpu_device *adev;
2385         int i, ret = 0;
2386
2387         mutex_lock(&mgpu_info.mutex);
2388
2389         /*
2390          * MGPU fan boost feature should be enabled
2391          * only when there are two or more dGPUs in
2392          * the system
2393          */
2394         if (mgpu_info.num_dgpu < 2)
2395                 goto out;
2396
2397         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2398                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2399                 adev = gpu_ins->adev;
2400                 if (!(adev->flags & AMD_IS_APU) &&
2401                     !gpu_ins->mgpu_fan_enabled) {
2402                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2403                         if (ret)
2404                                 break;
2405
2406                         gpu_ins->mgpu_fan_enabled = 1;
2407                 }
2408         }
2409
2410 out:
2411         mutex_unlock(&mgpu_info.mutex);
2412
2413         return ret;
2414 }
2415
2416 /**
2417  * amdgpu_device_ip_late_init - run late init for hardware IPs
2418  *
2419  * @adev: amdgpu_device pointer
2420  *
2421  * Late initialization pass for hardware IPs.  The list of all the hardware
2422  * IPs that make up the asic is walked and the late_init callbacks are run.
2423  * late_init covers any special initialization that an IP requires
2424  * after all of them have been initialized or something that needs to happen
2425  * late in the init process.
2426  * Returns 0 on success, negative error code on failure.
2427  */
2428 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2429 {
2430         struct amdgpu_gpu_instance *gpu_instance;
2431         int i = 0, r;
2432
2433         for (i = 0; i < adev->num_ip_blocks; i++) {
2434                 if (!adev->ip_blocks[i].status.hw)
2435                         continue;
2436                 if (adev->ip_blocks[i].version->funcs->late_init) {
2437                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2438                         if (r) {
2439                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2440                                           adev->ip_blocks[i].version->funcs->name, r);
2441                                 return r;
2442                         }
2443                 }
2444                 adev->ip_blocks[i].status.late_initialized = true;
2445         }
2446
2447         amdgpu_ras_set_error_query_ready(adev, true);
2448
2449         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2450         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2451
2452         amdgpu_device_fill_reset_magic(adev);
2453
2454         r = amdgpu_device_enable_mgpu_fan_boost();
2455         if (r)
2456                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2457
2458
2459         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2460                 mutex_lock(&mgpu_info.mutex);
2461
2462                 /*
2463                  * Reset device p-state to low as it was booted with a high p-state.
2464                  *
2465                  * This should be performed only after all devices from the same
2466                  * hive get initialized.
2467                  *
2468                  * However, the number of devices in the hive is not known in
2469                  * advance, as it is only counted up during device initialization.
2470                  *
2471                  * So, we wait until all XGMI interlinked devices are initialized.
2472                  * This may bring some delays as those devices may come from
2473                  * different hives. But that should be OK.
2474                  */
2475                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2476                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2477                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2478                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2479                                         continue;
2480
2481                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2482                                                 AMDGPU_XGMI_PSTATE_MIN);
2483                                 if (r) {
2484                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2485                                         break;
2486                                 }
2487                         }
2488                 }
2489
2490                 mutex_unlock(&mgpu_info.mutex);
2491         }
2492
2493         return 0;
2494 }
2495
2496 /**
2497  * amdgpu_device_ip_fini - run fini for hardware IPs
2498  *
2499  * @adev: amdgpu_device pointer
2500  *
2501  * Main teardown pass for hardware IPs.  The list of all the hardware
2502  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2503  * are run.  hw_fini tears down the hardware associated with each IP
2504  * and sw_fini tears down any software state associated with each IP.
2505  * Returns 0 on success, negative error code on failure.
2506  */
2507 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2508 {
2509         int i, r;
2510
2511         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2512                 amdgpu_virt_release_ras_err_handler_data(adev);
2513
2514         amdgpu_ras_pre_fini(adev);
2515
2516         if (adev->gmc.xgmi.num_physical_nodes > 1)
2517                 amdgpu_xgmi_remove_device(adev);
2518
2519         amdgpu_amdkfd_device_fini(adev);
2520
2521         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2522         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2523
2524         /* need to disable SMC first */
2525         for (i = 0; i < adev->num_ip_blocks; i++) {
2526                 if (!adev->ip_blocks[i].status.hw)
2527                         continue;
2528                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2529                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2530                         /* XXX handle errors */
2531                         if (r) {
2532                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2533                                           adev->ip_blocks[i].version->funcs->name, r);
2534                         }
2535                         adev->ip_blocks[i].status.hw = false;
2536                         break;
2537                 }
2538         }
2539
2540         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2541                 if (!adev->ip_blocks[i].status.hw)
2542                         continue;
2543
2544                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2545                 /* XXX handle errors */
2546                 if (r) {
2547                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2548                                   adev->ip_blocks[i].version->funcs->name, r);
2549                 }
2550
2551                 adev->ip_blocks[i].status.hw = false;
2552         }
2553
2554
2555         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2556                 if (!adev->ip_blocks[i].status.sw)
2557                         continue;
2558
2559                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2560                         amdgpu_ucode_free_bo(adev);
2561                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2562                         amdgpu_device_wb_fini(adev);
2563                         amdgpu_device_vram_scratch_fini(adev);
2564                         amdgpu_ib_pool_fini(adev);
2565                 }
2566
2567                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2568                 /* XXX handle errors */
2569                 if (r) {
2570                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2571                                   adev->ip_blocks[i].version->funcs->name, r);
2572                 }
2573                 adev->ip_blocks[i].status.sw = false;
2574                 adev->ip_blocks[i].status.valid = false;
2575         }
2576
2577         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2578                 if (!adev->ip_blocks[i].status.late_initialized)
2579                         continue;
2580                 if (adev->ip_blocks[i].version->funcs->late_fini)
2581                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2582                 adev->ip_blocks[i].status.late_initialized = false;
2583         }
2584
2585         amdgpu_ras_fini(adev);
2586
2587         if (amdgpu_sriov_vf(adev))
2588                 if (amdgpu_virt_release_full_gpu(adev, false))
2589                         DRM_ERROR("failed to release exclusive mode on fini\n");
2590
2591         return 0;
2592 }
2593
2594 /**
2595  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2596  *
2597  * @work: work_struct.
2598  */
2599 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2600 {
2601         struct amdgpu_device *adev =
2602                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2603         int r;
2604
2605         r = amdgpu_ib_ring_tests(adev);
2606         if (r)
2607                 DRM_ERROR("ib ring test failed (%d).\n", r);
2608 }
2609
2610 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2611 {
2612         struct amdgpu_device *adev =
2613                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2614
2615         mutex_lock(&adev->gfx.gfx_off_mutex);
2616         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2617                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2618                         adev->gfx.gfx_off_state = true;
2619         }
2620         mutex_unlock(&adev->gfx.gfx_off_mutex);
2621 }
2622
2623 /**
2624  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2625  *
2626  * @adev: amdgpu_device pointer
2627  *
2628  * Main suspend function for hardware IPs.  The list of all the hardware
2629  * IPs that make up the asic is walked, clockgating is disabled and the
2630  * suspend callbacks are run.  suspend puts the hardware and software state
2631  * in each IP into a state suitable for suspend.
2632  * Returns 0 on success, negative error code on failure.
2633  */
2634 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2635 {
2636         int i, r;
2637
2638         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2639         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2640
2641         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2642                 if (!adev->ip_blocks[i].status.valid)
2643                         continue;
2644
2645                 /* displays are handled separately */
2646                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2647                         continue;
2648
2649                 /* XXX handle errors */
2650                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2651                 /* XXX handle errors */
2652                 if (r) {
2653                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2654                                   adev->ip_blocks[i].version->funcs->name, r);
2655                         return r;
2656                 }
2657
2658                 adev->ip_blocks[i].status.hw = false;
2659         }
2660
2661         return 0;
2662 }
2663
2664 /**
2665  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2666  *
2667  * @adev: amdgpu_device pointer
2668  *
2669  * Main suspend function for hardware IPs.  The list of all the hardware
2670  * IPs that make up the asic is walked, clockgating is disabled and the
2671  * suspend callbacks are run.  suspend puts the hardware and software state
2672  * in each IP into a state suitable for suspend.
2673  * Returns 0 on success, negative error code on failure.
2674  */
2675 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2676 {
2677         int i, r;
2678
2679         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2680                 if (!adev->ip_blocks[i].status.valid)
2681                         continue;
2682                 /* displays are handled in phase1 */
2683                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2684                         continue;
2685                 /* PSP lost connection when err_event_athub occurs */
2686                 if (amdgpu_ras_intr_triggered() &&
2687                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2688                         adev->ip_blocks[i].status.hw = false;
2689                         continue;
2690                 }
2691                 /* XXX handle errors */
2692                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2693                 /* XXX handle errors */
2694                 if (r) {
2695                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2696                                   adev->ip_blocks[i].version->funcs->name, r);
2697                 }
2698                 adev->ip_blocks[i].status.hw = false;
2699                 /* handle putting the SMC in the appropriate state */
2700                 if (!amdgpu_sriov_vf(adev)) {
2701                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2702                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2703                                 if (r) {
2704                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2705                                                         adev->mp1_state, r);
2706                                         return r;
2707                                 }
2708                         }
2709                 }
2710                 adev->ip_blocks[i].status.hw = false;
2711         }
2712
2713         return 0;
2714 }
2715
2716 /**
2717  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2718  *
2719  * @adev: amdgpu_device pointer
2720  *
2721  * Main suspend function for hardware IPs.  The list of all the hardware
2722  * IPs that make up the asic is walked, clockgating is disabled and the
2723  * suspend callbacks are run.  suspend puts the hardware and software state
2724  * in each IP into a state suitable for suspend.
2725  * Returns 0 on success, negative error code on failure.
2726  */
2727 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2728 {
2729         int r;
2730
2731         if (amdgpu_sriov_vf(adev))
2732                 amdgpu_virt_request_full_gpu(adev, false);
2733
2734         r = amdgpu_device_ip_suspend_phase1(adev);
2735         if (r)
2736                 return r;
2737         r = amdgpu_device_ip_suspend_phase2(adev);
2738
2739         if (amdgpu_sriov_vf(adev))
2740                 amdgpu_virt_release_full_gpu(adev, false);
2741
2742         return r;
2743 }
2744
2745 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2746 {
2747         int i, r;
2748
2749         static enum amd_ip_block_type ip_order[] = {
2750                 AMD_IP_BLOCK_TYPE_GMC,
2751                 AMD_IP_BLOCK_TYPE_COMMON,
2752                 AMD_IP_BLOCK_TYPE_PSP,
2753                 AMD_IP_BLOCK_TYPE_IH,
2754         };
2755
2756         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2757                 int j;
2758                 struct amdgpu_ip_block *block;
2759
2760                 block = &adev->ip_blocks[i];
2761                 block->status.hw = false;
2762
2763                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2764
2765                         if (block->version->type != ip_order[j] ||
2766                                 !block->status.valid)
2767                                 continue;
2768
2769                         r = block->version->funcs->hw_init(adev);
2770                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2771                         if (r)
2772                                 return r;
2773                         block->status.hw = true;
2774                 }
2775         }
2776
2777         return 0;
2778 }
2779
2780 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2781 {
2782         int i, r;
2783
2784         static enum amd_ip_block_type ip_order[] = {
2785                 AMD_IP_BLOCK_TYPE_SMC,
2786                 AMD_IP_BLOCK_TYPE_DCE,
2787                 AMD_IP_BLOCK_TYPE_GFX,
2788                 AMD_IP_BLOCK_TYPE_SDMA,
2789                 AMD_IP_BLOCK_TYPE_UVD,
2790                 AMD_IP_BLOCK_TYPE_VCE,
2791                 AMD_IP_BLOCK_TYPE_VCN
2792         };
2793
2794         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2795                 int j;
2796                 struct amdgpu_ip_block *block;
2797
2798                 for (j = 0; j < adev->num_ip_blocks; j++) {
2799                         block = &adev->ip_blocks[j];
2800
2801                         if (block->version->type != ip_order[i] ||
2802                                 !block->status.valid ||
2803                                 block->status.hw)
2804                                 continue;
2805
2806                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2807                                 r = block->version->funcs->resume(adev);
2808                         else
2809                                 r = block->version->funcs->hw_init(adev);
2810
2811                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2812                         if (r)
2813                                 return r;
2814                         block->status.hw = true;
2815                 }
2816         }
2817
2818         return 0;
2819 }
2820
2821 /**
2822  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2823  *
2824  * @adev: amdgpu_device pointer
2825  *
2826  * First resume function for hardware IPs.  The list of all the hardware
2827  * IPs that make up the asic is walked and the resume callbacks are run for
2828  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2829  * after a suspend and updates the software state as necessary.  This
2830  * function is also used for restoring the GPU after a GPU reset.
2831  * Returns 0 on success, negative error code on failure.
2832  */
2833 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2834 {
2835         int i, r;
2836
2837         for (i = 0; i < adev->num_ip_blocks; i++) {
2838                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2839                         continue;
2840                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2841                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2842                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2843
2844                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2845                         if (r) {
2846                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2847                                           adev->ip_blocks[i].version->funcs->name, r);
2848                                 return r;
2849                         }
2850                         adev->ip_blocks[i].status.hw = true;
2851                 }
2852         }
2853
2854         return 0;
2855 }
2856
2857 /**
2858  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2859  *
2860  * @adev: amdgpu_device pointer
2861  *
2862  * Second resume function for hardware IPs.  The list of all the hardware
2863  * IPs that make up the asic is walked and the resume callbacks are run for
2864  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2865  * functional state after a suspend and updates the software state as
2866  * necessary.  This function is also used for restoring the GPU after a GPU
2867  * reset.
2868  * Returns 0 on success, negative error code on failure.
2869  */
2870 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2871 {
2872         int i, r;
2873
2874         for (i = 0; i < adev->num_ip_blocks; i++) {
2875                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2876                         continue;
2877                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2878                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2879                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2880                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2881                         continue;
2882                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2883                 if (r) {
2884                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2885                                   adev->ip_blocks[i].version->funcs->name, r);
2886                         return r;
2887                 }
2888                 adev->ip_blocks[i].status.hw = true;
2889         }
2890
2891         return 0;
2892 }
2893
2894 /**
2895  * amdgpu_device_ip_resume - run resume for hardware IPs
2896  *
2897  * @adev: amdgpu_device pointer
2898  *
2899  * Main resume function for hardware IPs.  The hardware IPs
2900  * are split into two resume functions because they are
2901  * also used in recovering from a GPU reset and some additional
2902  * steps need to be taken between them.  In this case (S3/S4) they are
2903  * run sequentially.
2904  * Returns 0 on success, negative error code on failure.
2905  */
2906 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2907 {
2908         int r;
2909
2910         r = amdgpu_device_ip_resume_phase1(adev);
2911         if (r)
2912                 return r;
2913
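        /*
         * Firmware loading sits between the two phases so that the IP blocks
         * resumed in phase 2 already have their microcode in place.
         */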
2914         r = amdgpu_device_fw_loading(adev);
2915         if (r)
2916                 return r;
2917
2918         r = amdgpu_device_ip_resume_phase2(adev);
2919
2920         return r;
2921 }
2922
2923 /**
2924  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2925  *
2926  * @adev: amdgpu_device pointer
2927  *
2928  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2929  */
2930 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2931 {
2932         if (amdgpu_sriov_vf(adev)) {
2933                 if (adev->is_atom_fw) {
2934                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2935                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2936                 } else {
2937                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2938                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2939                 }
2940
2941                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2942                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2943         }
2944 }
2945
2946 /**
2947  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2948  *
2949  * @asic_type: AMD asic type
2950  *
2951  * Check if there is DC (new modesetting infrastructure) support for an asic.
2952  * Returns true if DC has support, false if not.
2953  */
2954 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2955 {
2956         switch (asic_type) {
2957 #if defined(CONFIG_DRM_AMD_DC)
2958 #if defined(CONFIG_DRM_AMD_DC_SI)
2959         case CHIP_TAHITI:
2960         case CHIP_PITCAIRN:
2961         case CHIP_VERDE:
2962         case CHIP_OLAND:
2963 #endif
2964         case CHIP_BONAIRE:
2965         case CHIP_KAVERI:
2966         case CHIP_KABINI:
2967         case CHIP_MULLINS:
2968                 /*
2969                  * We have systems in the wild with these ASICs that require
2970                  * LVDS and VGA support which is not supported with DC.
2971                  *
2972                  * Fallback to the non-DC driver here by default so as not to
2973                  * cause regressions.
2974                  */
2975                 return amdgpu_dc > 0;
2976         case CHIP_HAWAII:
2977         case CHIP_CARRIZO:
2978         case CHIP_STONEY:
2979         case CHIP_POLARIS10:
2980         case CHIP_POLARIS11:
2981         case CHIP_POLARIS12:
2982         case CHIP_VEGAM:
2983         case CHIP_TONGA:
2984         case CHIP_FIJI:
2985         case CHIP_VEGA10:
2986         case CHIP_VEGA12:
2987         case CHIP_VEGA20:
2988 #if defined(CONFIG_DRM_AMD_DC_DCN)
2989         case CHIP_RAVEN:
2990         case CHIP_NAVI10:
2991         case CHIP_NAVI14:
2992         case CHIP_NAVI12:
2993         case CHIP_RENOIR:
2994 #endif
2995 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2996         case CHIP_SIENNA_CICHLID:
2997         case CHIP_NAVY_FLOUNDER:
2998 #endif
2999                 return amdgpu_dc != 0;
3000 #endif
3001         default:
3002                 if (amdgpu_dc > 0)
3003                         DRM_INFO("Display Core has been requested via kernel parameter "
3004                                          "but isn't supported by ASIC, ignoring\n");
3005                 return false;
3006         }
3007 }
3008
3009 /**
3010  * amdgpu_device_has_dc_support - check if dc is supported
3011  *
3012  * @adev: amdgpu_device pointer
3013  *
3014  * Returns true for supported, false for not supported
3015  */
3016 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3017 {
3018         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3019                 return false;
3020
3021         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3022 }
3023
3024
3025 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3026 {
3027         struct amdgpu_device *adev =
3028                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3029         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3030
3031         /* It's a bug to not have a hive within this function */
3032         if (WARN_ON(!hive))
3033                 return;
3034
3035         /*
3036          * Use task barrier to synchronize all xgmi reset works across the
3037          * hive. task_barrier_enter and task_barrier_exit will block
3038          * until all the threads running the xgmi reset works reach
3039          * those points. task_barrier_full will do both blocks.
3040          */
3041         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3042
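                /*
                 * task_barrier_enter()/task_barrier_exit() bracket the BACO
                 * entry so that every device in the hive has entered BACO
                 * before the first one starts to exit it.
                 */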
3043                 task_barrier_enter(&hive->tb);
3044                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3045
3046                 if (adev->asic_reset_res)
3047                         goto fail;
3048
3049                 task_barrier_exit(&hive->tb);
3050                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3051
3052                 if (adev->asic_reset_res)
3053                         goto fail;
3054
3055                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3056                         adev->mmhub.funcs->reset_ras_error_count(adev);
3057         } else {
3058
3059                 task_barrier_full(&hive->tb);
3060                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3061         }
3062
3063 fail:
3064         if (adev->asic_reset_res)
3065                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3066                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3067         amdgpu_put_xgmi_hive(hive);
3068 }
3069
3070 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3071 {
3072         char *input = amdgpu_lockup_timeout;
3073         char *timeout_setting = NULL;
3074         int index = 0;
3075         long timeout;
3076         int ret = 0;
3077
3078         /*
3079          * By default the timeout for non-compute jobs is 10000 ms
3080          * and there is no timeout enforced on compute jobs.
3081          * In SR-IOV or passthrough mode, the timeout for compute
3082          * jobs is 60000 ms by default.
3083          */
3084         adev->gfx_timeout = msecs_to_jiffies(10000);
3085         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3086         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3087                 adev->compute_timeout =  msecs_to_jiffies(60000);
3088         else
3089                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3090
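        /*
         * The lockup_timeout parameter is a comma separated list; going by
         * the index mapping below the expected order is gfx,compute,sdma,
         * video, with a single value applying to all non-compute jobs.
         */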
3091         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3092                 while ((timeout_setting = strsep(&input, ",")) &&
3093                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3094                         ret = kstrtol(timeout_setting, 0, &timeout);
3095                         if (ret)
3096                                 return ret;
3097
3098                         if (timeout == 0) {
3099                                 index++;
3100                                 continue;
3101                         } else if (timeout < 0) {
3102                                 timeout = MAX_SCHEDULE_TIMEOUT;
3103                         } else {
3104                                 timeout = msecs_to_jiffies(timeout);
3105                         }
3106
3107                         switch (index++) {
3108                         case 0:
3109                                 adev->gfx_timeout = timeout;
3110                                 break;
3111                         case 1:
3112                                 adev->compute_timeout = timeout;
3113                                 break;
3114                         case 2:
3115                                 adev->sdma_timeout = timeout;
3116                                 break;
3117                         case 3:
3118                                 adev->video_timeout = timeout;
3119                                 break;
3120                         default:
3121                                 break;
3122                         }
3123                 }
3124                 /*
3125                  * There is only one value specified and
3126                  * it should apply to all non-compute jobs.
3127                  */
3128                 if (index == 1) {
3129                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3130                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3131                                 adev->compute_timeout = adev->gfx_timeout;
3132                 }
3133         }
3134
3135         return ret;
3136 }
3137
3138 static const struct attribute *amdgpu_dev_attributes[] = {
3139         &dev_attr_product_name.attr,
3140         &dev_attr_product_number.attr,
3141         &dev_attr_serial_number.attr,
3142         &dev_attr_pcie_replay_count.attr,
3143         NULL
3144 };
3145
3146
3147 /**
3148  * amdgpu_device_init - initialize the driver
3149  *
3150  * @adev: amdgpu_device pointer
3151  * @flags: driver flags
3152  *
3153  * Initializes the driver info and hw (all asics).
3154  * Returns 0 for success or an error on failure.
3155  * Called at driver startup.
3156  */
3157 int amdgpu_device_init(struct amdgpu_device *adev,
3158                        uint32_t flags)
3159 {
3160         struct drm_device *ddev = adev_to_drm(adev);
3161         struct pci_dev *pdev = adev->pdev;
3162         int r, i;
3163         bool boco = false;
3164         u32 max_MBps;
3165
3166         adev->shutdown = false;
3167         adev->flags = flags;
3168
3169         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3170                 adev->asic_type = amdgpu_force_asic_type;
3171         else
3172                 adev->asic_type = flags & AMD_ASIC_MASK;
3173
3174         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3175         if (amdgpu_emu_mode == 1)
3176                 adev->usec_timeout *= 10;
3177         adev->gmc.gart_size = 512 * 1024 * 1024;
3178         adev->accel_working = false;
3179         adev->num_rings = 0;
3180         adev->mman.buffer_funcs = NULL;
3181         adev->mman.buffer_funcs_ring = NULL;
3182         adev->vm_manager.vm_pte_funcs = NULL;
3183         adev->vm_manager.vm_pte_num_scheds = 0;
3184         adev->gmc.gmc_funcs = NULL;
3185         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3186         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3187
3188         adev->smc_rreg = &amdgpu_invalid_rreg;
3189         adev->smc_wreg = &amdgpu_invalid_wreg;
3190         adev->pcie_rreg = &amdgpu_invalid_rreg;
3191         adev->pcie_wreg = &amdgpu_invalid_wreg;
3192         adev->pciep_rreg = &amdgpu_invalid_rreg;
3193         adev->pciep_wreg = &amdgpu_invalid_wreg;
3194         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3195         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3196         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3197         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3198         adev->didt_rreg = &amdgpu_invalid_rreg;
3199         adev->didt_wreg = &amdgpu_invalid_wreg;
3200         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3201         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3202         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3203         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3204
3205         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3206                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3207                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3208
3209         /* mutex initializations are all done here so we
3210          * can recall functions without having locking issues */
3211         atomic_set(&adev->irq.ih.lock, 0);
3212         mutex_init(&adev->firmware.mutex);
3213         mutex_init(&adev->pm.mutex);
3214         mutex_init(&adev->gfx.gpu_clock_mutex);
3215         mutex_init(&adev->srbm_mutex);
3216         mutex_init(&adev->gfx.pipe_reserve_mutex);
3217         mutex_init(&adev->gfx.gfx_off_mutex);
3218         mutex_init(&adev->grbm_idx_mutex);
3219         mutex_init(&adev->mn_lock);
3220         mutex_init(&adev->virt.vf_errors.lock);
3221         hash_init(adev->mn_hash);
3222         atomic_set(&adev->in_gpu_reset, 0);
3223         init_rwsem(&adev->reset_sem);
3224         mutex_init(&adev->psp.mutex);
3225         mutex_init(&adev->notifier_lock);
3226
3227         r = amdgpu_device_check_arguments(adev);
3228         if (r)
3229                 return r;
3230
3231         spin_lock_init(&adev->mmio_idx_lock);
3232         spin_lock_init(&adev->smc_idx_lock);
3233         spin_lock_init(&adev->pcie_idx_lock);
3234         spin_lock_init(&adev->uvd_ctx_idx_lock);
3235         spin_lock_init(&adev->didt_idx_lock);
3236         spin_lock_init(&adev->gc_cac_idx_lock);
3237         spin_lock_init(&adev->se_cac_idx_lock);
3238         spin_lock_init(&adev->audio_endpt_idx_lock);
3239         spin_lock_init(&adev->mm_stats.lock);
3240
3241         INIT_LIST_HEAD(&adev->shadow_list);
3242         mutex_init(&adev->shadow_list_lock);
3243
3244         INIT_DELAYED_WORK(&adev->delayed_init_work,
3245                           amdgpu_device_delayed_init_work_handler);
3246         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3247                           amdgpu_device_delay_enable_gfx_off);
3248
3249         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3250
3251         adev->gfx.gfx_off_req_count = 1;
3252         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3253
3254         atomic_set(&adev->throttling_logging_enabled, 1);
3255         /*
3256          * If throttling continues, logging will be performed every minute
3257          * to avoid log flooding. "-1" is subtracted since the thermal
3258          * throttling interrupt comes every second. Thus, the total logging
3259          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3260          * for throttling interrupt) = 60 seconds.
3261          */
3262         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3263         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3264
3265         /* Registers mapping */
3266         /* TODO: block userspace mapping of io register */
3267         if (adev->asic_type >= CHIP_BONAIRE) {
3268                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3269                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3270         } else {
3271                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3272                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3273         }
3274
3275         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3276         if (adev->rmmio == NULL) {
3277                 return -ENOMEM;
3278         }
3279         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3280         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3281
3282         /* io port mapping */
3283         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3284                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3285                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3286                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3287                         break;
3288                 }
3289         }
3290         if (adev->rio_mem == NULL)
3291                 DRM_INFO("PCI I/O BAR is not found.\n");
3292
3293         /* enable PCIE atomic ops */
3294         r = pci_enable_atomic_ops_to_root(adev->pdev,
3295                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3296                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3297         if (r) {
3298                 adev->have_atomics_support = false;
3299                 DRM_INFO("PCIE atomic ops is not supported\n");
3300         } else {
3301                 adev->have_atomics_support = true;
3302         }
3303
3304         amdgpu_device_get_pcie_info(adev);
3305
3306         if (amdgpu_mcbp)
3307                 DRM_INFO("MCBP is enabled\n");
3308
3309         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3310                 adev->enable_mes = true;
3311
3312         /* detect hw virtualization here */
3313         amdgpu_detect_virtualization(adev);
3314
3315         r = amdgpu_device_get_job_timeout_settings(adev);
3316         if (r) {
3317                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3318                 goto failed_unmap;
3319         }
3320
3321         /* early init functions */
3322         r = amdgpu_device_ip_early_init(adev);
3323         if (r)
3324                 goto failed_unmap;
3325
3326         /* doorbell bar mapping and doorbell index init */
3327         amdgpu_device_doorbell_init(adev);
3328
3329         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3330         /* this will fail for cards that aren't VGA class devices, just
3331          * ignore it */
3332         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3333
3334         if (amdgpu_device_supports_boco(ddev))
3335                 boco = true;
3336         if (amdgpu_has_atpx() &&
3337             (amdgpu_is_atpx_hybrid() ||
3338              amdgpu_has_atpx_dgpu_power_cntl()) &&
3339             !pci_is_thunderbolt_attached(adev->pdev))
3340                 vga_switcheroo_register_client(adev->pdev,
3341                                                &amdgpu_switcheroo_ops, boco);
3342         if (boco)
3343                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3344
3345         if (amdgpu_emu_mode == 1) {
3346                 /* post the asic on emulation mode */
3347                 emu_soc_asic_init(adev);
3348                 goto fence_driver_init;
3349         }
3350
3351         /* detect if we are running with an SR-IOV vBIOS */
3352         amdgpu_device_detect_sriov_bios(adev);
3353
3354         /* check if we need to reset the asic
3355          *  E.g., driver was not cleanly unloaded previously, etc.
3356          */
3357         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3358                 r = amdgpu_asic_reset(adev);
3359                 if (r) {
3360                         dev_err(adev->dev, "asic reset on init failed\n");
3361                         goto failed;
3362                 }
3363         }
3364
3365         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3366
3367         /* Post card if necessary */
3368         if (amdgpu_device_need_post(adev)) {
3369                 if (!adev->bios) {
3370                         dev_err(adev->dev, "no vBIOS found\n");
3371                         r = -EINVAL;
3372                         goto failed;
3373                 }
3374                 DRM_INFO("GPU posting now...\n");
3375                 r = amdgpu_device_asic_init(adev);
3376                 if (r) {
3377                         dev_err(adev->dev, "gpu post error!\n");
3378                         goto failed;
3379                 }
3380         }
3381
3382         if (adev->is_atom_fw) {
3383                 /* Initialize clocks */
3384                 r = amdgpu_atomfirmware_get_clock_info(adev);
3385                 if (r) {
3386                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3387                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3388                         goto failed;
3389                 }
3390         } else {
3391                 /* Initialize clocks */
3392                 r = amdgpu_atombios_get_clock_info(adev);
3393                 if (r) {
3394                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3395                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3396                         goto failed;
3397                 }
3398                 /* init i2c buses */
3399                 if (!amdgpu_device_has_dc_support(adev))
3400                         amdgpu_atombios_i2c_init(adev);
3401         }
3402
3403 fence_driver_init:
3404         /* Fence driver */
3405         r = amdgpu_fence_driver_init(adev);
3406         if (r) {
3407                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3408                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3409                 goto failed;
3410         }
3411
3412         /* init the mode config */
3413         drm_mode_config_init(adev_to_drm(adev));
3414
3415         r = amdgpu_device_ip_init(adev);
3416         if (r) {
3417                 /* failed in exclusive mode due to timeout */
3418                 if (amdgpu_sriov_vf(adev) &&
3419                     !amdgpu_sriov_runtime(adev) &&
3420                     amdgpu_virt_mmio_blocked(adev) &&
3421                     !amdgpu_virt_wait_reset(adev)) {
3422                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3423                         /* Don't send request since VF is inactive. */
3424                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3425                         adev->virt.ops = NULL;
3426                         r = -EAGAIN;
3427                         goto failed;
3428                 }
3429                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3430                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3431                 goto failed;
3432         }
3433
3434         dev_info(adev->dev,
3435                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3436                         adev->gfx.config.max_shader_engines,
3437                         adev->gfx.config.max_sh_per_se,
3438                         adev->gfx.config.max_cu_per_sh,
3439                         adev->gfx.cu_info.number);
3440
3441         adev->accel_working = true;
3442
3443         amdgpu_vm_check_compute_bug(adev);
3444
3445         /* Initialize the buffer migration limit. */
3446         if (amdgpu_moverate >= 0)
3447                 max_MBps = amdgpu_moverate;
3448         else
3449                 max_MBps = 8; /* Allow 8 MB/s. */
3450         /* Get a log2 for easy divisions. */
3451         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3452
3453         amdgpu_fbdev_init(adev);
3454
3455         r = amdgpu_pm_sysfs_init(adev);
3456         if (r) {
3457                 adev->pm_sysfs_en = false;
3458                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3459         } else
3460                 adev->pm_sysfs_en = true;
3461
3462         r = amdgpu_ucode_sysfs_init(adev);
3463         if (r) {
3464                 adev->ucode_sysfs_en = false;
3465                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3466         } else
3467                 adev->ucode_sysfs_en = true;
3468
3469         if ((amdgpu_testing & 1)) {
3470                 if (adev->accel_working)
3471                         amdgpu_test_moves(adev);
3472                 else
3473                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3474         }
3475         if (amdgpu_benchmarking) {
3476                 if (adev->accel_working)
3477                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3478                 else
3479                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3480         }
3481
3482         /*
3483          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3484          * Otherwise the mgpu fan boost feature will be skipped because the
3485          * gpu instance count would be too low.
3486          */
3487         amdgpu_register_gpu_instance(adev);
3488
3489         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3490          * explicit gating rather than handling it automatically.
3491          */
3492         r = amdgpu_device_ip_late_init(adev);
3493         if (r) {
3494                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3495                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3496                 goto failed;
3497         }
3498
3499         /* must succeed. */
3500         amdgpu_ras_resume(adev);
3501
3502         queue_delayed_work(system_wq, &adev->delayed_init_work,
3503                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3504
3505         if (amdgpu_sriov_vf(adev))
3506                 flush_delayed_work(&adev->delayed_init_work);
3507
3508         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3509         if (r)
3510                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3511
3512         if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3513                 r = amdgpu_pmu_init(adev);
3514                 if (r)
3515                         dev_err(adev->dev, "amdgpu_pmu_init failed\n");
        }
3516
3517         /* Have stored pci confspace at hand for restore in sudden PCI error */
3518         if (amdgpu_device_cache_pci_state(adev->pdev))
3519                 pci_restore_state(pdev);
3520
3521         return 0;
3522
3523 failed:
3524         amdgpu_vf_error_trans_all(adev);
3525         if (boco)
3526                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3527
3528 failed_unmap:
3529         iounmap(adev->rmmio);
3530         adev->rmmio = NULL;
3531
3532         return r;
3533 }
3534
3535 /**
3536  * amdgpu_device_fini - tear down the driver
3537  *
3538  * @adev: amdgpu_device pointer
3539  *
3540  * Tear down the driver info (all asics).
3541  * Called at driver shutdown.
3542  */
3543 void amdgpu_device_fini(struct amdgpu_device *adev)
3544 {
3545         dev_info(adev->dev, "amdgpu: finishing device.\n");
3546         flush_delayed_work(&adev->delayed_init_work);
3547         adev->shutdown = true;
3548
3549         kfree(adev->pci_state);
3550
3551         /* make sure the IB tests have finished before entering exclusive mode
3552          * to avoid preemption during the IB tests
3553          */
3554         if (amdgpu_sriov_vf(adev)) {
3555                 amdgpu_virt_request_full_gpu(adev, false);
3556                 amdgpu_virt_fini_data_exchange(adev);
3557         }
3558
3559         /* disable all interrupts */
3560         amdgpu_irq_disable_all(adev);
3561         if (adev->mode_info.mode_config_initialized){
3562                 if (!amdgpu_device_has_dc_support(adev))
3563                         drm_helper_force_disable_all(adev_to_drm(adev));
3564                 else
3565                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3566         }
3567         amdgpu_fence_driver_fini(adev);
3568         if (adev->pm_sysfs_en)
3569                 amdgpu_pm_sysfs_fini(adev);
3570         amdgpu_fbdev_fini(adev);
3571         amdgpu_device_ip_fini(adev);
3572         release_firmware(adev->firmware.gpu_info_fw);
3573         adev->firmware.gpu_info_fw = NULL;
3574         adev->accel_working = false;
3575         /* free i2c buses */
3576         if (!amdgpu_device_has_dc_support(adev))
3577                 amdgpu_i2c_fini(adev);
3578
3579         if (amdgpu_emu_mode != 1)
3580                 amdgpu_atombios_fini(adev);
3581
3582         kfree(adev->bios);
3583         adev->bios = NULL;
3584         if (amdgpu_has_atpx() &&
3585             (amdgpu_is_atpx_hybrid() ||
3586              amdgpu_has_atpx_dgpu_power_cntl()) &&
3587             !pci_is_thunderbolt_attached(adev->pdev))
3588                 vga_switcheroo_unregister_client(adev->pdev);
3589         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3590                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3591         vga_client_register(adev->pdev, NULL, NULL, NULL);
3592         if (adev->rio_mem)
3593                 pci_iounmap(adev->pdev, adev->rio_mem);
3594         adev->rio_mem = NULL;
3595         iounmap(adev->rmmio);
3596         adev->rmmio = NULL;
3597         amdgpu_device_doorbell_fini(adev);
3598
3599         if (adev->ucode_sysfs_en)
3600                 amdgpu_ucode_sysfs_fini(adev);
3601
3602         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3603         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3604                 amdgpu_pmu_fini(adev);
3605         if (adev->mman.discovery_bin)
3606                 amdgpu_discovery_fini(adev);
3607 }
3608
3609
3610 /*
3611  * Suspend & resume.
3612  */
3613 /**
3614  * amdgpu_device_suspend - initiate device suspend
3615  *
3616  * @dev: drm dev pointer
3617  * @fbcon : notify the fbdev of suspend
3618  *
3619  * Puts the hw in the suspend state (all asics).
3620  * Returns 0 for success or an error on failure.
3621  * Called at driver suspend.
3622  */
3623 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3624 {
3625         struct amdgpu_device *adev;
3626         struct drm_crtc *crtc;
3627         struct drm_connector *connector;
3628         struct drm_connector_list_iter iter;
3629         int r;
3630
3631         adev = drm_to_adev(dev);
3632
3633         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3634                 return 0;
3635
3636         adev->in_suspend = true;
3637         drm_kms_helper_poll_disable(dev);
3638
3639         if (fbcon)
3640                 amdgpu_fbdev_set_suspend(adev, 1);
3641
3642         cancel_delayed_work_sync(&adev->delayed_init_work);
3643
3644         if (!amdgpu_device_has_dc_support(adev)) {
3645                 /* turn off display hw */
3646                 drm_modeset_lock_all(dev);
3647                 drm_connector_list_iter_begin(dev, &iter);
3648                 drm_for_each_connector_iter(connector, &iter)
3649                         drm_helper_connector_dpms(connector,
3650                                                   DRM_MODE_DPMS_OFF);
3651                 drm_connector_list_iter_end(&iter);
3652                 drm_modeset_unlock_all(dev);
3653                 /* unpin the front buffers and cursors */
3654                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3655                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3656                         struct drm_framebuffer *fb = crtc->primary->fb;
3657                         struct amdgpu_bo *robj;
3658
3659                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3660                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3661                                 r = amdgpu_bo_reserve(aobj, true);
3662                                 if (r == 0) {
3663                                         amdgpu_bo_unpin(aobj);
3664                                         amdgpu_bo_unreserve(aobj);
3665                                 }
3666                         }
3667
3668                         if (fb == NULL || fb->obj[0] == NULL) {
3669                                 continue;
3670                         }
3671                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3672                         /* don't unpin kernel fb objects */
3673                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3674                                 r = amdgpu_bo_reserve(robj, true);
3675                                 if (r == 0) {
3676                                         amdgpu_bo_unpin(robj);
3677                                         amdgpu_bo_unreserve(robj);
3678                                 }
3679                         }
3680                 }
3681         }
3682
3683         amdgpu_ras_suspend(adev);
3684
3685         r = amdgpu_device_ip_suspend_phase1(adev);
3686
3687         amdgpu_amdkfd_suspend(adev, !fbcon);
3688
3689         /* evict vram memory */
3690         amdgpu_bo_evict_vram(adev);
3691
3692         amdgpu_fence_driver_suspend(adev);
3693
3694         r = amdgpu_device_ip_suspend_phase2(adev);
3695
3696         /* evict remaining vram memory
3697          * This second call to evict vram is to evict the gart page table
3698          * using the CPU.
3699          */
3700         amdgpu_bo_evict_vram(adev);
3701
3702         return 0;
3703 }
3704
3705 /**
3706  * amdgpu_device_resume - initiate device resume
3707  *
3708  * @dev: drm dev pointer
3709  * @fbcon : notify the fbdev of resume
3710  *
3711  * Bring the hw back to operating state (all asics).
3712  * Returns 0 for success or an error on failure.
3713  * Called at driver resume.
3714  */
3715 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3716 {
3717         struct drm_connector *connector;
3718         struct drm_connector_list_iter iter;
3719         struct amdgpu_device *adev = drm_to_adev(dev);
3720         struct drm_crtc *crtc;
3721         int r = 0;
3722
3723         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3724                 return 0;
3725
3726         /* post card */
3727         if (amdgpu_device_need_post(adev)) {
3728                 r = amdgpu_device_asic_init(adev);
3729                 if (r)
3730                         dev_err(adev->dev, "amdgpu asic init failed\n");
3731         }
3732
3733         r = amdgpu_device_ip_resume(adev);
3734         if (r) {
3735                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3736                 return r;
3737         }
3738         amdgpu_fence_driver_resume(adev);
3739
3740
3741         r = amdgpu_device_ip_late_init(adev);
3742         if (r)
3743                 return r;
3744
3745         queue_delayed_work(system_wq, &adev->delayed_init_work,
3746                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3747
3748         if (!amdgpu_device_has_dc_support(adev)) {
3749                 /* pin cursors */
3750                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3751                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3752
3753                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3754                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3755                                 r = amdgpu_bo_reserve(aobj, true);
3756                                 if (r == 0) {
3757                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3758                                         if (r != 0)
3759                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3760                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3761                                         amdgpu_bo_unreserve(aobj);
3762                                 }
3763                         }
3764                 }
3765         }
3766         r = amdgpu_amdkfd_resume(adev, !fbcon);
3767         if (r)
3768                 return r;
3769
3770         /* Make sure IB tests flushed */
3771         flush_delayed_work(&adev->delayed_init_work);
3772
3773         /* blat the mode back in */
3774         if (fbcon) {
3775                 if (!amdgpu_device_has_dc_support(adev)) {
3776                         /* pre DCE11 */
3777                         drm_helper_resume_force_mode(dev);
3778
3779                         /* turn on display hw */
3780                         drm_modeset_lock_all(dev);
3781
3782                         drm_connector_list_iter_begin(dev, &iter);
3783                         drm_for_each_connector_iter(connector, &iter)
3784                                 drm_helper_connector_dpms(connector,
3785                                                           DRM_MODE_DPMS_ON);
3786                         drm_connector_list_iter_end(&iter);
3787
3788                         drm_modeset_unlock_all(dev);
3789                 }
3790                 amdgpu_fbdev_set_suspend(adev, 0);
3791         }
3792
3793         drm_kms_helper_poll_enable(dev);
3794
3795         amdgpu_ras_resume(adev);
3796
3797         /*
3798          * Most of the connector probing functions try to acquire runtime pm
3799          * refs to ensure that the GPU is powered on when connector polling is
3800          * performed. Since we're calling this from a runtime PM callback,
3801          * trying to acquire rpm refs will cause us to deadlock.
3802          *
3803          * Since we're guaranteed to be holding the rpm lock, it's safe to
3804          * temporarily disable the rpm helpers so this doesn't deadlock us.
3805          */
3806 #ifdef CONFIG_PM
3807         dev->dev->power.disable_depth++;
3808 #endif
3809         if (!amdgpu_device_has_dc_support(adev))
3810                 drm_helper_hpd_irq_event(dev);
3811         else
3812                 drm_kms_helper_hotplug_event(dev);
3813 #ifdef CONFIG_PM
3814         dev->dev->power.disable_depth--;
3815 #endif
3816         adev->in_suspend = false;
3817
3818         return 0;
3819 }
3820
3821 /**
3822  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3823  *
3824  * @adev: amdgpu_device pointer
3825  *
3826  * The list of all the hardware IPs that make up the asic is walked and
3827  * the check_soft_reset callbacks are run.  check_soft_reset determines
3828  * if the asic is still hung or not.
3829  * Returns true if any of the IPs are still in a hung state, false if not.
3830  */
3831 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3832 {
3833         int i;
3834         bool asic_hang = false;
3835
3836         if (amdgpu_sriov_vf(adev))
3837                 return true;
3838
3839         if (amdgpu_asic_need_full_reset(adev))
3840                 return true;
3841
3842         for (i = 0; i < adev->num_ip_blocks; i++) {
3843                 if (!adev->ip_blocks[i].status.valid)
3844                         continue;
3845                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3846                         adev->ip_blocks[i].status.hang =
3847                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3848                 if (adev->ip_blocks[i].status.hang) {
3849                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3850                         asic_hang = true;
3851                 }
3852         }
3853         return asic_hang;
3854 }
3855
3856 /**
3857  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3858  *
3859  * @adev: amdgpu_device pointer
3860  *
3861  * The list of all the hardware IPs that make up the asic is walked and the
3862  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3863  * handles any IP specific hardware or software state changes that are
3864  * necessary for a soft reset to succeed.
3865  * Returns 0 on success, negative error code on failure.
3866  */
3867 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3868 {
3869         int i, r = 0;
3870
3871         for (i = 0; i < adev->num_ip_blocks; i++) {
3872                 if (!adev->ip_blocks[i].status.valid)
3873                         continue;
3874                 if (adev->ip_blocks[i].status.hang &&
3875                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3876                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3877                         if (r)
3878                                 return r;
3879                 }
3880         }
3881
3882         return 0;
3883 }
3884
3885 /**
3886  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3887  *
3888  * @adev: amdgpu_device pointer
3889  *
3890  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3891  * reset is necessary to recover.
3892  * Returns true if a full asic reset is required, false if not.
3893  */
3894 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3895 {
3896         int i;
3897
3898         if (amdgpu_asic_need_full_reset(adev))
3899                 return true;
3900
3901         for (i = 0; i < adev->num_ip_blocks; i++) {
3902                 if (!adev->ip_blocks[i].status.valid)
3903                         continue;
3904                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3905                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3906                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3907                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3908                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3909                         if (adev->ip_blocks[i].status.hang) {
3910                                 dev_info(adev->dev, "Some block need full reset!\n");
3911                                 return true;
3912                         }
3913                 }
3914         }
3915         return false;
3916 }
3917
3918 /**
3919  * amdgpu_device_ip_soft_reset - do a soft reset
3920  *
3921  * @adev: amdgpu_device pointer
3922  *
3923  * The list of all the hardware IPs that make up the asic is walked and the
3924  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3925  * IP specific hardware or software state changes that are necessary to soft
3926  * reset the IP.
3927  * Returns 0 on success, negative error code on failure.
3928  */
3929 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3930 {
3931         int i, r = 0;
3932
3933         for (i = 0; i < adev->num_ip_blocks; i++) {
3934                 if (!adev->ip_blocks[i].status.valid)
3935                         continue;
3936                 if (adev->ip_blocks[i].status.hang &&
3937                     adev->ip_blocks[i].version->funcs->soft_reset) {
3938                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3939                         if (r)
3940                                 return r;
3941                 }
3942         }
3943
3944         return 0;
3945 }
3946
3947 /**
3948  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3949  *
3950  * @adev: amdgpu_device pointer
3951  *
3952  * The list of all the hardware IPs that make up the asic is walked and the
3953  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3954  * handles any IP specific hardware or software state changes that are
3955  * necessary after the IP has been soft reset.
3956  * Returns 0 on success, negative error code on failure.
3957  */
3958 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3959 {
3960         int i, r = 0;
3961
3962         for (i = 0; i < adev->num_ip_blocks; i++) {
3963                 if (!adev->ip_blocks[i].status.valid)
3964                         continue;
3965                 if (adev->ip_blocks[i].status.hang &&
3966                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3967                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3968                 if (r)
3969                         return r;
3970         }
3971
3972         return 0;
3973 }
3974
3975 /**
3976  * amdgpu_device_recover_vram - Recover some VRAM contents
3977  *
3978  * @adev: amdgpu_device pointer
3979  *
3980  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3981  * restore things like GPUVM page tables after a GPU reset where
3982  * the contents of VRAM might be lost.
3983  *
3984  * Returns:
3985  * 0 on success, negative error code on failure.
3986  */
3987 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3988 {
3989         struct dma_fence *fence = NULL, *next = NULL;
3990         struct amdgpu_bo *shadow;
3991         long r = 1, tmo;
3992
3993         if (amdgpu_sriov_runtime(adev))
3994                 tmo = msecs_to_jiffies(8000);
3995         else
3996                 tmo = msecs_to_jiffies(100);
3997
3998         dev_info(adev->dev, "recover vram bo from shadow start\n");
3999         mutex_lock(&adev->shadow_list_lock);
4000         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4001
4002                 /* No need to recover an evicted BO */
4003                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4004                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4005                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4006                         continue;
4007
4008                 r = amdgpu_bo_restore_shadow(shadow, &next);
4009                 if (r)
4010                         break;
4011
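                /*
                 * Wait on the fence of the previous restore while the copy
                 * just issued is in flight; dma_fence_wait_timeout() returns
                 * the remaining jiffies, so the timeout budget is shared
                 * across all buffers.
                 */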
4012                 if (fence) {
4013                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4014                         dma_fence_put(fence);
4015                         fence = next;
4016                         if (tmo == 0) {
4017                                 r = -ETIMEDOUT;
4018                                 break;
4019                         } else if (tmo < 0) {
4020                                 r = tmo;
4021                                 break;
4022                         }
4023                 } else {
4024                         fence = next;
4025                 }
4026         }
4027         mutex_unlock(&adev->shadow_list_lock);
4028
4029         if (fence)
4030                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4031         dma_fence_put(fence);
4032
4033         if (r < 0 || tmo <= 0) {
4034                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4035                 return -EIO;
4036         }
4037
4038         dev_info(adev->dev, "recover vram bo from shadow done\n");
4039         return 0;
4040 }
4041
4042
4043 /**
4044  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4045  *
4046  * @adev: amdgpu device pointer
4047  * @from_hypervisor: request from hypervisor
4048  *
4049  * Do a VF FLR and reinitialize the ASIC.
4050  * Returns 0 on success, negative error code on failure.
4051  */
4052 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4053                                      bool from_hypervisor)
4054 {
4055         int r;
4056
4057         if (from_hypervisor)
4058                 r = amdgpu_virt_request_full_gpu(adev, true);
4059         else
4060                 r = amdgpu_virt_reset_gpu(adev);
4061         if (r)
4062                 return r;
4063
4064         amdgpu_amdkfd_pre_reset(adev);
4065
4066         /* Resume IP prior to SMC */
4067         r = amdgpu_device_ip_reinit_early_sriov(adev);
4068         if (r)
4069                 goto error;
4070
4071         amdgpu_virt_init_data_exchange(adev);
4072         /* we need recover gart prior to run SMC/CP/SDMA resume */
4073         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4074
4075         r = amdgpu_device_fw_loading(adev);
4076         if (r)
4077                 return r;
4078
4079         /* now we are okay to resume SMC/CP/SDMA */
4080         r = amdgpu_device_ip_reinit_late_sriov(adev);
4081         if (r)
4082                 goto error;
4083
4084         amdgpu_irq_gpu_reset_resume_helper(adev);
4085         r = amdgpu_ib_ring_tests(adev);
4086         amdgpu_amdkfd_post_reset(adev);
4087
4088 error:
4089         amdgpu_virt_release_full_gpu(adev, true);
4090         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4091                 amdgpu_inc_vram_lost(adev);
4092                 r = amdgpu_device_recover_vram(adev);
4093         }
4094
4095         return r;
4096 }
4097
4098 /**
4099  * amdgpu_device_has_job_running - check if there is any job in mirror list
4100  *
4101  * @adev: amdgpu device pointer
4102  *
4103  * Check if there is any job in the mirror list; returns true if so.
4104  */
4105 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4106 {
4107         int i;
4108         struct drm_sched_job *job;
4109
4110         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4111                 struct amdgpu_ring *ring = adev->rings[i];
4112
4113                 if (!ring || !ring->sched.thread)
4114                         continue;
4115
4116                 spin_lock(&ring->sched.job_list_lock);
4117                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4118                                 struct drm_sched_job, node);
4119                 spin_unlock(&ring->sched.job_list_lock);
4120                 if (job)
4121                         return true;
4122         }
4123         return false;
4124 }
4125
4126 /**
4127  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4128  *
4129  * @adev: amdgpu device pointer
4130  *
4131  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4132  * a hung GPU.
4133  */
4134 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4135 {
4136         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4137                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4138                 return false;
4139         }
4140
4141         if (amdgpu_gpu_recovery == 0)
4142                 goto disabled;
4143
4144         if (amdgpu_sriov_vf(adev))
4145                 return true;
4146
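        /*
         * amdgpu_gpu_recovery == -1 means "auto": recovery is only attempted
         * on the ASICs listed below.
         */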
4147         if (amdgpu_gpu_recovery == -1) {
4148                 switch (adev->asic_type) {
4149                 case CHIP_BONAIRE:
4150                 case CHIP_HAWAII:
4151                 case CHIP_TOPAZ:
4152                 case CHIP_TONGA:
4153                 case CHIP_FIJI:
4154                 case CHIP_POLARIS10:
4155                 case CHIP_POLARIS11:
4156                 case CHIP_POLARIS12:
4157                 case CHIP_VEGAM:
4158                 case CHIP_VEGA20:
4159                 case CHIP_VEGA10:
4160                 case CHIP_VEGA12:
4161                 case CHIP_RAVEN:
4162                 case CHIP_ARCTURUS:
4163                 case CHIP_RENOIR:
4164                 case CHIP_NAVI10:
4165                 case CHIP_NAVI14:
4166                 case CHIP_NAVI12:
4167                 case CHIP_SIENNA_CICHLID:
4168                         break;
4169                 default:
4170                         goto disabled;
4171                 }
4172         }
4173
4174         return true;
4175
4176 disabled:
4177         dev_info(adev->dev, "GPU recovery disabled.\n");
4178         return false;
4179 }
4180
4181
4182 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4183                                         struct amdgpu_job *job,
4184                                         bool *need_full_reset_arg)
4185 {
4186         int i, r = 0;
4187         bool need_full_reset  = *need_full_reset_arg;
4188
4189         amdgpu_debugfs_wait_dump(adev);
4190
4191         if (amdgpu_sriov_vf(adev)) {
4192                 /* stop the data exchange thread */
4193                 amdgpu_virt_fini_data_exchange(adev);
4194         }
4195
4196         /* block all schedulers and reset given job's ring */
4197         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4198                 struct amdgpu_ring *ring = adev->rings[i];
4199
4200                 if (!ring || !ring->sched.thread)
4201                         continue;
4202
4203                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4204                 amdgpu_fence_driver_force_completion(ring);
4205         }
4206
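        /*
         * Bump the karma of the offending job (if any) so its context can be
         * marked guilty and the job dropped rather than resubmitted once the
         * hang limit is exceeded.
         */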
4207         if (job)
4208                 drm_sched_increase_karma(&job->base);
4209
4210         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4211         if (!amdgpu_sriov_vf(adev)) {
4212
4213                 if (!need_full_reset)
4214                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4215
4216                 if (!need_full_reset) {
4217                         amdgpu_device_ip_pre_soft_reset(adev);
4218                         r = amdgpu_device_ip_soft_reset(adev);
4219                         amdgpu_device_ip_post_soft_reset(adev);
4220                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4221                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4222                                 need_full_reset = true;
4223                         }
4224                 }
4225
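                /*
                 * A full reset reinitializes the hardware from scratch, so
                 * suspend all IP blocks first to quiesce them before the
                 * ASIC reset.
                 */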
4226                 if (need_full_reset)
4227                         r = amdgpu_device_ip_suspend(adev);
4228
4229                 *need_full_reset_arg = need_full_reset;
4230         }
4231
4232         return r;
4233 }
4234
4235 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4236                                struct list_head *device_list_handle,
4237                                bool *need_full_reset_arg,
4238                                bool skip_hw_reset)
4239 {
4240         struct amdgpu_device *tmp_adev = NULL;
4241         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4242         int r = 0;
4243
4244         /*
4245          * ASIC reset has to be done on all XGMI hive nodes ASAP
4246          * to allow proper link negotiation in FW (within 1 sec)
4247          */
4248         if (!skip_hw_reset && need_full_reset) {
4249                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4250                         /* For XGMI run all resets in parallel to speed up the process */
4251                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4252                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4253                                         r = -EALREADY;
4254                         } else
4255                                 r = amdgpu_asic_reset(tmp_adev);
4256
4257                         if (r) {
4258                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4259                                          r, adev_to_drm(tmp_adev)->unique);
4260                                 break;
4261                         }
4262                 }
4263
4264                 /* For XGMI wait for all resets to complete before proceed */
4265                 if (!r) {
4266                         list_for_each_entry(tmp_adev, device_list_handle,
4267                                             gmc.xgmi.head) {
4268                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4269                                         flush_work(&tmp_adev->xgmi_reset_work);
4270                                         r = tmp_adev->asic_reset_res;
4271                                         if (r)
4272                                                 break;
4273                                 }
4274                         }
4275                 }
4276         }
4277
4278         if (!r && amdgpu_ras_intr_triggered()) {
4279                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4280                         if (tmp_adev->mmhub.funcs &&
4281                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4282                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4283                 }
4284
4285                 amdgpu_ras_intr_cleared();
4286         }
4287
4288         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4289                 if (need_full_reset) {
4290                         /* post card */
4291                         if (amdgpu_device_asic_init(tmp_adev))
4292                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4293
4294                         if (!r) {
4295                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4296                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4297                                 if (r)
4298                                         goto out;
4299
4300                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4301                                 if (vram_lost) {
4302                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4303                                         amdgpu_inc_vram_lost(tmp_adev);
4304                                 }
4305
4306                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4307                                 if (r)
4308                                         goto out;
4309
4310                                 r = amdgpu_device_fw_loading(tmp_adev);
4311                                 if (r)
4312                                         return r;
4313
4314                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4315                                 if (r)
4316                                         goto out;
4317
4318                                 if (vram_lost)
4319                                         amdgpu_device_fill_reset_magic(tmp_adev);
4320
4321                                 /*
4322                                  * Add this ASIC back as tracked, since the reset
4323                                  * completed successfully.
4324                                  */
4325                                 amdgpu_register_gpu_instance(tmp_adev);
4326
4327                                 r = amdgpu_device_ip_late_init(tmp_adev);
4328                                 if (r)
4329                                         goto out;
4330
4331                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4332
4333                                 /*
4334                                  * The GPU enters a bad state once the number of
4335                                  * faulty pages flagged by ECC reaches the
4336                                  * threshold, and RAS recovery is scheduled next.
4337                                  * So check here whether the bad page threshold has
4338                                  * indeed been exceeded and, if so, abort the
4339                                  * recovery and remind the user to either retire
4340                                  * this GPU or set a larger bad_page_threshold
4341                                  * value the next time the driver is probed.
4342                                  */
4343                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4344                                         /* must succeed. */
4345                                         amdgpu_ras_resume(tmp_adev);
4346                                 } else {
4347                                         r = -EINVAL;
4348                                         goto out;
4349                                 }
4350
4351                                 /* Update PSP FW topology after reset */
4352                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4353                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4354                         }
4355                 }
4356
4357 out:
4358                 if (!r) {
4359                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4360                         r = amdgpu_ib_ring_tests(tmp_adev);
4361                         if (r) {
4362                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4363                                 r = amdgpu_device_ip_suspend(tmp_adev);
4364                                 need_full_reset = true;
4365                                 r = -EAGAIN;
4366                                 goto end;
4367                         }
4368                 }
4369
4370                 if (!r)
4371                         r = amdgpu_device_recover_vram(tmp_adev);
4372                 else
4373                         tmp_adev->asic_reset_res = r;
4374         }
4375
4376 end:
4377         *need_full_reset_arg = need_full_reset;
4378         return r;
4379 }
4380
4381 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4382                                 struct amdgpu_hive_info *hive)
4383 {
4384         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4385                 return false;
4386
4387         if (hive) {
4388                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4389         } else {
4390                 down_write(&adev->reset_sem);
4391         }
4392
4393         atomic_inc(&adev->gpu_reset_counter);
4394         switch (amdgpu_asic_reset_method(adev)) {
4395         case AMD_RESET_METHOD_MODE1:
4396                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4397                 break;
4398         case AMD_RESET_METHOD_MODE2:
4399                 adev->mp1_state = PP_MP1_STATE_RESET;
4400                 break;
4401         default:
4402                 adev->mp1_state = PP_MP1_STATE_NONE;
4403                 break;
4404         }
4405
4406         return true;
4407 }
4408
4409 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4410 {
4411         amdgpu_vf_error_trans_all(adev);
4412         adev->mp1_state = PP_MP1_STATE_NONE;
4413         atomic_set(&adev->in_gpu_reset, 0);
4414         up_write(&adev->reset_sem);
4415 }
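
/*
 * Minimal usage sketch (illustration only, not part of the driver): the
 * lock/unlock helpers above are meant to bracket any reset work on a
 * device, roughly:
 *
 *	if (amdgpu_device_lock_adev(adev, hive)) {
 *		... suspend IPs, reset the ASIC, resume IPs ...
 *		amdgpu_device_unlock_adev(adev);
 *	}
 *
 * amdgpu_device_gpu_recover() below follows this pattern for every device
 * in the reset list; the trylock semantics make a second, concurrent reset
 * attempt bail out instead of blocking.
 */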
4416
4417 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4418 {
4419         struct pci_dev *p = NULL;
4420
4421         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4422                         adev->pdev->bus->number, 1);
4423         if (p) {
4424                 pm_runtime_enable(&(p->dev));
4425                 pm_runtime_resume(&(p->dev));
4426         }
4427 }
4428
4429 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4430 {
4431         enum amd_reset_method reset_method;
4432         struct pci_dev *p = NULL;
4433         u64 expires;
4434
4435         /*
4436          * For now, only BACO and mode1 reset are confirmed
4437          * to suffer the audio issue without proper suspension.
4438          */
4439         reset_method = amdgpu_asic_reset_method(adev);
4440         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4441              (reset_method != AMD_RESET_METHOD_MODE1))
4442                 return -EINVAL;
4443
4444         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4445                         adev->pdev->bus->number, 1);
4446         if (!p)
4447                 return -ENODEV;
4448
4449         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4450         if (!expires)
4451                 /*
4452                  * If we cannot get the audio device autosuspend delay,
4453                  * a fixed 4s interval is used. Since 3s is the default
4454                  * autosuspend delay of the audio controller, the 4s
4455                  * interval used here is guaranteed to cover it.
4456                  */
4457                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4458
4459         while (!pm_runtime_status_suspended(&(p->dev))) {
4460                 if (!pm_runtime_suspend(&(p->dev)))
4461                         break;
4462
4463                 if (expires < ktime_get_mono_fast_ns()) {
4464                         dev_warn(adev->dev, "failed to suspend display audio\n");
4465                         /* TODO: abort the succeeding gpu reset? */
4466                         return -ETIMEDOUT;
4467                 }
4468         }
4469
4470         pm_runtime_disable(&(p->dev));
4471
4472         return 0;
4473 }
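
/*
 * Illustrative pairing (sketch, not part of the driver): a caller that
 * manages to suspend the audio function before a reset is expected to
 * resume it afterwards, e.g.:
 *
 *	audio_suspended = !amdgpu_device_suspend_display_audio(adev);
 *	... perform the GPU reset ...
 *	if (audio_suspended)
 *		amdgpu_device_resume_display_audio(adev);
 *
 * This is how amdgpu_device_gpu_recover() below uses the two helpers for
 * each device in the reset list.
 */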
4474
4475 /**
4476  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4477  *
4478  * @adev: amdgpu device pointer
4479  * @job: which job triggered the hang
4480  *
4481  * Attempt to reset the GPU if it has hung (all asics).
4482  * Attempt to do a soft reset or full reset and reinitialize the ASIC.
4483  * Returns 0 for success or an error on failure.
4484  */
4485
4486 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4487                               struct amdgpu_job *job)
4488 {
4489         struct list_head device_list, *device_list_handle =  NULL;
4490         bool need_full_reset = false;
4491         bool job_signaled = false;
4492         struct amdgpu_hive_info *hive = NULL;
4493         struct amdgpu_device *tmp_adev = NULL;
4494         int i, r = 0;
4495         bool need_emergency_restart = false;
4496         bool audio_suspended = false;
4497
4498         /*
4499          * Special case: RAS triggered and full reset isn't supported
4500          */
4501         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4502
4503         /*
4504          * Flush RAM to disk so that after reboot
4505          * the user can read log and see why the system rebooted.
4506          */
4507         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4508                 DRM_WARN("Emergency reboot.");
4509
4510                 ksys_sync_helper();
4511                 emergency_restart();
4512         }
4513
4514         dev_info(adev->dev, "GPU %s begin!\n",
4515                 need_emergency_restart ? "jobs stop":"reset");
4516
4517         /*
4518          * Here we trylock to avoid a chain of resets executing from
4519          * jobs triggered either on different adevs in an XGMI hive or on
4520          * different schedulers of the same device while this TO handler runs.
4521          * We always reset all schedulers for a device and all devices in an
4522          * XGMI hive, so that should take care of them too.
4523          */
4524         hive = amdgpu_get_xgmi_hive(adev);
4525         if (hive) {
4526                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4527                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4528                                 job ? job->base.id : -1, hive->hive_id);
4529                         amdgpu_put_xgmi_hive(hive);
4530                         return 0;
4531                 }
4532                 mutex_lock(&hive->hive_lock);
4533         }
4534
4535         /*
4536          * Build list of devices to reset.
4537          * In case we are in XGMI hive mode, re-sort the device list
4538          * to put adev in the 1st position.
4539          */
4540         INIT_LIST_HEAD(&device_list);
4541         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4542                 if (!hive)
4543                         return -ENODEV;
4544                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4545                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4546                 device_list_handle = &hive->device_list;
4547         } else {
4548                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4549                 device_list_handle = &device_list;
4550         }
4551
4552         /* block all schedulers and reset given job's ring */
4553         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4554                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4555                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4556                                   job ? job->base.id : -1);
4557                         r = 0;
4558                         goto skip_recovery;
4559                 }
4560
4561                 /*
4562                  * Try to put the audio codec into the suspend state
4563                  * before the GPU reset starts.
4564                  *
4565                  * The power domain of the graphics device is shared
4566                  * with the AZ power domain. Without suspending the
4567                  * codec first, we may change the audio hardware from
4568                  * behind the audio driver's back, which can trigger
4569                  * audio codec errors.
4570                  */
4571                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4572                         audio_suspended = true;
4573
4574                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4575
4576                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4577
4578                 if (!amdgpu_sriov_vf(tmp_adev))
4579                         amdgpu_amdkfd_pre_reset(tmp_adev);
4580
4581                 /*
4582                  * Mark the ASICs to be reset as untracked first,
4583                  * and add them back after the reset completes.
4584                  */
4585                 amdgpu_unregister_gpu_instance(tmp_adev);
4586
4587                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4588
4589                 /* disable ras on ALL IPs */
4590                 if (!need_emergency_restart &&
4591                       amdgpu_device_ip_need_full_reset(tmp_adev))
4592                         amdgpu_ras_suspend(tmp_adev);
4593
4594                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4595                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4596
4597                         if (!ring || !ring->sched.thread)
4598                                 continue;
4599
4600                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4601
4602                         if (need_emergency_restart)
4603                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4604                 }
4605         }
4606
4607         if (need_emergency_restart)
4608                 goto skip_sched_resume;
4609
4610         /*
4611          * Must check whether the guilty job has already signaled here,
4612          * since after this point all old HW fences are force signaled.
4613          *
4614          * job->base holds a reference to parent fence
4615          */
4616         if (job && job->base.s_fence->parent &&
4617             dma_fence_is_signaled(job->base.s_fence->parent)) {
4618                 job_signaled = true;
4619                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4620                 goto skip_hw_reset;
4621         }
4622
4623 retry:  /* Pre-ASIC-reset for the rest of the adevs in the XGMI hive. */
4624         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4625                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4626                                                  NULL,
4627                                                  &need_full_reset);
4628                 /* TODO: Should we stop? */
4629                 if (r) {
4630                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4631                                   r, adev_to_drm(tmp_adev)->unique);
4632                         tmp_adev->asic_reset_res = r;
4633                 }
4634         }
4635
4636         /* Actual ASIC resets if needed. */
4637         /* TODO Implement XGMI hive reset logic for SRIOV */
4638         if (amdgpu_sriov_vf(adev)) {
4639                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4640                 if (r)
4641                         adev->asic_reset_res = r;
4642         } else {
4643                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4644                 if (r == -EAGAIN)
4645                         goto retry;
4646         }
4647
4648 skip_hw_reset:
4649
4650         /* Post ASIC reset for all devs. */
4651         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4652
4653                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4654                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4655
4656                         if (!ring || !ring->sched.thread)
4657                                 continue;
4658
4659                         /* No point in resubmitting jobs if we didn't do a HW reset */
4660                         if (!tmp_adev->asic_reset_res && !job_signaled)
4661                                 drm_sched_resubmit_jobs(&ring->sched);
4662
4663                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4664                 }
4665
4666                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4667                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4668                 }
4669
4670                 tmp_adev->asic_reset_res = 0;
4671
4672                 if (r) {
4673                         /* bad news, how to tell it to userspace? */
4674                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4675                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4676                 } else {
4677                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4678                 }
4679         }
4680
4681 skip_sched_resume:
4682         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4683                 /* unlock kfd: SRIOV would do it separately */
4684                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4685                         amdgpu_amdkfd_post_reset(tmp_adev);
4686                 if (audio_suspended)
4687                         amdgpu_device_resume_display_audio(tmp_adev);
4688                 amdgpu_device_unlock_adev(tmp_adev);
4689         }
4690
4691 skip_recovery:
4692         if (hive) {
4693                 atomic_set(&hive->in_reset, 0);
4694                 mutex_unlock(&hive->hive_lock);
4695                 amdgpu_put_xgmi_hive(hive);
4696         }
4697
4698         if (r)
4699                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4700         return r;
4701 }
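
/*
 * For illustration only: the usual entry point into the recovery path above
 * is the DRM scheduler's job timeout callback. In amdgpu_job.c this looks
 * roughly like the following simplified sketch (not the verbatim handler):
 *
 *	static void amdgpu_job_timedout(struct drm_sched_job *s_job)
 *	{
 *		struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
 *		struct amdgpu_job *job = to_amdgpu_job(s_job);
 *
 *		if (amdgpu_device_should_recover_gpu(ring->adev))
 *			amdgpu_device_gpu_recover(ring->adev, job);
 *	}
 */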
4702
4703 /**
4704  * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
4705  *
4706  * @adev: amdgpu_device pointer
4707  *
4708  * Fetches and stores in the driver the PCIe capabilities (gen speed
4709  * and lanes) of the slot the device is in. Handles APUs and
4710  * virtualized environments where PCIe config space may not be available.
4711  */
4712 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4713 {
4714         struct pci_dev *pdev;
4715         enum pci_bus_speed speed_cap, platform_speed_cap;
4716         enum pcie_link_width platform_link_width;
4717
4718         if (amdgpu_pcie_gen_cap)
4719                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4720
4721         if (amdgpu_pcie_lane_cap)
4722                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4723
4724         /* covers APUs as well */
4725         if (pci_is_root_bus(adev->pdev->bus)) {
4726                 if (adev->pm.pcie_gen_mask == 0)
4727                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4728                 if (adev->pm.pcie_mlw_mask == 0)
4729                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4730                 return;
4731         }
4732
4733         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4734                 return;
4735
4736         pcie_bandwidth_available(adev->pdev, NULL,
4737                                  &platform_speed_cap, &platform_link_width);
4738
4739         if (adev->pm.pcie_gen_mask == 0) {
4740                 /* asic caps */
4741                 pdev = adev->pdev;
4742                 speed_cap = pcie_get_speed_cap(pdev);
4743                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4744                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4745                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4746                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4747                 } else {
4748                         if (speed_cap == PCIE_SPEED_16_0GT)
4749                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4750                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4751                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4752                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4753                         else if (speed_cap == PCIE_SPEED_8_0GT)
4754                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4755                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4756                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4757                         else if (speed_cap == PCIE_SPEED_5_0GT)
4758                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4759                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4760                         else
4761                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4762                 }
4763                 /* platform caps */
4764                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4765                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4766                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4767                 } else {
4768                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4769                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4770                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4771                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4772                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4773                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4774                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4775                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4776                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4777                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4778                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4779                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4780                         else
4781                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4782
4783                 }
4784         }
4785         if (adev->pm.pcie_mlw_mask == 0) {
4786                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4787                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4788                 } else {
4789                         switch (platform_link_width) {
4790                         case PCIE_LNK_X32:
4791                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4792                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4793                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4794                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4795                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4796                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4797                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4798                                 break;
4799                         case PCIE_LNK_X16:
4800                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4801                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4802                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4803                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4804                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4805                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4806                                 break;
4807                         case PCIE_LNK_X12:
4808                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4809                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4810                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4811                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4812                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4813                                 break;
4814                         case PCIE_LNK_X8:
4815                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4816                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4817                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4818                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4819                                 break;
4820                         case PCIE_LNK_X4:
4821                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4822                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4823                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4824                                 break;
4825                         case PCIE_LNK_X2:
4826                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4827                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4828                                 break;
4829                         case PCIE_LNK_X1:
4830                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4831                                 break;
4832                         default:
4833                                 break;
4834                         }
4835                 }
4836         }
4837 }
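
/*
 * Note (illustration, not part of the driver): the detection above is
 * skipped for any mask already forced through the amdgpu.pcie_gen_cap /
 * amdgpu.pcie_lane_cap module parameters. Forcing them on the kernel
 * command line, e.g.
 *
 *	amdgpu.pcie_gen_cap=<mask> amdgpu.pcie_lane_cap=<mask>
 *
 * makes the driver use the given CAIL_* masks (see amd_pcie.h for the bit
 * layout) instead of querying the capabilities of the slot.
 */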
4838
4839 int amdgpu_device_baco_enter(struct drm_device *dev)
4840 {
4841         struct amdgpu_device *adev = drm_to_adev(dev);
4842         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4843
4844         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4845                 return -ENOTSUPP;
4846
4847         if (ras && ras->supported)
4848                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4849
4850         return amdgpu_dpm_baco_enter(adev);
4851 }
4852
4853 int amdgpu_device_baco_exit(struct drm_device *dev)
4854 {
4855         struct amdgpu_device *adev = drm_to_adev(dev);
4856         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4857         int ret = 0;
4858
4859         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4860                 return -ENOTSUPP;
4861
4862         ret = amdgpu_dpm_baco_exit(adev);
4863         if (ret)
4864                 return ret;
4865
4866         if (ras && ras->supported)
4867                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4868
4869         return 0;
4870 }
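
/*
 * Expected usage sketch (illustration only): on boards that support BACO,
 * the runtime-PM hooks in amdgpu_drv.c enter and leave BACO around runtime
 * suspend/resume, roughly:
 *
 *	In the runtime suspend path:
 *		if (amdgpu_device_supports_baco(drm_dev))
 *			ret = amdgpu_device_baco_enter(drm_dev);
 *
 *	In the runtime resume path:
 *		if (amdgpu_device_supports_baco(drm_dev))
 *			ret = amdgpu_device_baco_exit(drm_dev);
 *
 * The exact flow lives in amdgpu_pmops_runtime_suspend()/_resume().
 */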
4871
4872 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4873 {
4874         int i;
4875
4876         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4877                 struct amdgpu_ring *ring = adev->rings[i];
4878
4879                 if (!ring || !ring->sched.thread)
4880                         continue;
4881
4882                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4883         }
4884 }
4885
4886 /**
4887  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4888  * @pdev: PCI device struct
4889  * @state: PCI channel state
4890  *
4891  * Description: Called when a PCI error is detected.
4892  *
4893  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4894  */
4895 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4896 {
4897         struct drm_device *dev = pci_get_drvdata(pdev);
4898         struct amdgpu_device *adev = drm_to_adev(dev);
4899         int i;
4900
4901         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4902
4903         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4904                 DRM_WARN("No support for XGMI hive yet...");
4905                 return PCI_ERS_RESULT_DISCONNECT;
4906         }
4907
4908         switch (state) {
4909         case pci_channel_io_normal:
4910                 return PCI_ERS_RESULT_CAN_RECOVER;
4911         /* Fatal error, prepare for slot reset */
4912         case pci_channel_io_frozen:
4913                 /*
4914                  * Cancel and wait for all TDRs in progress if failing to
4915                  * set adev->in_gpu_reset in amdgpu_device_lock_adev
4916                  *
4917                  * Locking adev->reset_sem will prevent any external access
4918                  * to GPU during PCI error recovery
4919                  */
4920                 while (!amdgpu_device_lock_adev(adev, NULL))
4921                         amdgpu_cancel_all_tdr(adev);
4922
4923                 /*
4924                  * Block any work scheduling as we do for regular GPU reset
4925                  * for the duration of the recovery
4926                  */
4927                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4928                         struct amdgpu_ring *ring = adev->rings[i];
4929
4930                         if (!ring || !ring->sched.thread)
4931                                 continue;
4932
4933                         drm_sched_stop(&ring->sched, NULL);
4934                 }
4935                 return PCI_ERS_RESULT_NEED_RESET;
4936         case pci_channel_io_perm_failure:
4937                 /* Permanent error, prepare for device removal */
4938                 return PCI_ERS_RESULT_DISCONNECT;
4939         }
4940
4941         return PCI_ERS_RESULT_NEED_RESET;
4942 }
4943
4944 /**
4945  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4946  * @pdev: pointer to PCI device
4947  */
4948 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4949 {
4950
4951         DRM_INFO("PCI error: mmio enabled callback!!\n");
4952
4953         /* TODO - dump whatever for debugging purposes */
4954
4955         /* This is called only if amdgpu_pci_error_detected returns
4956          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4957          * works, no need to reset slot.
4958          */
4959
4960         return PCI_ERS_RESULT_RECOVERED;
4961 }
4962
4963 /**
4964  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4965  * @pdev: PCI device struct
4966  *
4967  * Description: This routine is called by the pci error recovery
4968  * code after the PCI slot has been reset, just before we
4969  * should resume normal operations.
4970  */
4971 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4972 {
4973         struct drm_device *dev = pci_get_drvdata(pdev);
4974         struct amdgpu_device *adev = drm_to_adev(dev);
4975         int r, i;
4976         bool need_full_reset = true;
4977         u32 memsize;
4978         struct list_head device_list;
4979
4980         DRM_INFO("PCI error: slot reset callback!!\n");
4981
4982         INIT_LIST_HEAD(&device_list);
4983         list_add_tail(&adev->gmc.xgmi.head, &device_list);
4984
4985         /* wait for asic to come out of reset */
4986         msleep(500);
4987
4988         /* Restore PCI config space */
4989         amdgpu_device_load_pci_state(pdev);
4990
4991         /* confirm ASIC came out of reset */
4992         for (i = 0; i < adev->usec_timeout; i++) {
4993                 memsize = amdgpu_asic_get_config_memsize(adev);
4994
4995                 if (memsize != 0xffffffff)
4996                         break;
4997                 udelay(1);
4998         }
4999         if (memsize == 0xffffffff) {
5000                 r = -ETIME;
5001                 goto out;
5002         }
5003
5004         adev->in_pci_err_recovery = true;
5005         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5006         adev->in_pci_err_recovery = false;
5007         if (r)
5008                 goto out;
5009
5010         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5011
5012 out:
5013         if (!r) {
5014                 if (amdgpu_device_cache_pci_state(adev->pdev))
5015                         pci_restore_state(adev->pdev);
5016
5017                 DRM_INFO("PCIe error recovery succeeded\n");
5018         } else {
5019                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5020                 amdgpu_device_unlock_adev(adev);
5021         }
5022
5023         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5024 }
5025
5026 /**
5027  * amdgpu_pci_resume() - resume normal ops after PCI reset
5028  * @pdev: pointer to PCI device
5029  *
5030  * Called when the error recovery driver tells us that it is
5031  * OK to resume normal operation. Restart the schedulers so
5032  * that halted jobs can resume.
5033  */
5034 void amdgpu_pci_resume(struct pci_dev *pdev)
5035 {
5036         struct drm_device *dev = pci_get_drvdata(pdev);
5037         struct amdgpu_device *adev = drm_to_adev(dev);
5038         int i;
5039
5040
5041         DRM_INFO("PCI error: resume callback!!\n");
5042
5043         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5044                 struct amdgpu_ring *ring = adev->rings[i];
5045
5046                 if (!ring || !ring->sched.thread)
5047                         continue;
5048
5049
5050                 drm_sched_resubmit_jobs(&ring->sched);
5051                 drm_sched_start(&ring->sched, true);
5052         }
5053
5054         amdgpu_device_unlock_adev(adev);
5055 }
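
/*
 * For reference (sketch; the actual table lives in amdgpu_drv.c): the four
 * callbacks above are handed to the PCI core through a struct
 * pci_error_handlers attached to the amdgpu pci_driver, roughly:
 *
 *	static struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * with the pci_driver's .err_handler member pointing at this table.
 */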
5056
5057 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5058 {
5059         struct drm_device *dev = pci_get_drvdata(pdev);
5060         struct amdgpu_device *adev = drm_to_adev(dev);
5061         int r;
5062
5063         r = pci_save_state(pdev);
5064         if (!r) {
5065                 kfree(adev->pci_state);
5066
5067                 adev->pci_state = pci_store_saved_state(pdev);
5068
5069                 if (!adev->pci_state) {
5070                         DRM_ERROR("Failed to store PCI saved state");
5071                         return false;
5072                 }
5073         } else {
5074                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5075                 return false;
5076         }
5077
5078         return true;
5079 }
5080
5081 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5082 {
5083         struct drm_device *dev = pci_get_drvdata(pdev);
5084         struct amdgpu_device *adev = drm_to_adev(dev);
5085         int r;
5086
5087         if (!adev->pci_state)
5088                 return false;
5089
5090         r = pci_load_saved_state(pdev, adev->pci_state);
5091
5092         if (!r) {
5093                 pci_restore_state(pdev);
5094         } else {
5095                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5096                 return false;
5097         }
5098
5099         return true;
5100 }
5101
5102