/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS                2000
#define AMDGPU_MAX_RETRY_LIMIT          2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
        "TAHITI",
        "PITCAIRN",
        "VERDE",
        "OLAND",
        "HAINAN",
        "BONAIRE",
        "KAVERI",
        "KABINI",
        "HAWAII",
        "MULLINS",
        "TOPAZ",
        "TONGA",
        "FIJI",
        "CARRIZO",
        "STONEY",
        "POLARIS10",
        "POLARIS11",
        "POLARIS12",
        "VEGAM",
        "VEGA10",
        "VEGA12",
        "VEGA20",
        "RAVEN",
        "ARCTURUS",
        "RENOIR",
        "ALDEBARAN",
        "NAVI10",
        "CYAN_SKILLFISH",
        "NAVI14",
        "NAVI12",
        "SIENNA_CICHLID",
        "NAVY_FLOUNDER",
        "VANGOGH",
        "DIMGREY_CAVEFISH",
        "BEIGE_GOBY",
        "YELLOW_CARP",
        "IP DISCOVERY",
        "LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);
        uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

        return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
                amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
                amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
                amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
                amdgpu_device_get_serial_number, NULL);
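
/*
 * Illustrative use of the attributes above, read from the device's sysfs
 * directory (the PCI address below is hypothetical, and the FRU-backed
 * files are only populated on certain server cards):
 *
 *   $ cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count
 *   $ cat /sys/bus/pci/devices/0000:03:00.0/serial_number
 */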

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
                return true;
        return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if (adev->has_pr3 ||
            ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
                return true;
        return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
        return (amdgpu_device_supports_boco(dev) &&
                amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
                             void *buf, size_t size, bool write)
{
        unsigned long flags;
        uint32_t hi = ~0, tmp = 0;
        uint32_t *data = buf;
        uint64_t last;
        int idx;

        if (!drm_dev_enter(adev_to_drm(adev), &idx))
                return;

        BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

        spin_lock_irqsave(&adev->mmio_idx_lock, flags);
        for (last = pos + size; pos < last; pos += 4) {
                tmp = pos >> 31;

                WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
                if (tmp != hi) {
                        WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
                        hi = tmp;
                }
                if (write)
                        WREG32_NO_KIQ(mmMM_DATA, *data++);
                else
                        *data++ = RREG32_NO_KIQ(mmMM_DATA);
        }

        spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
                                 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
        void __iomem *addr;
        size_t count = 0;
        uint64_t last;

        if (!adev->mman.aper_base_kaddr)
                return 0;

        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                addr = adev->mman.aper_base_kaddr + pos;
                count = last - pos;

                if (write) {
                        memcpy_toio(addr, buf, count);
                        mb();
                        amdgpu_device_flush_hdp(adev, NULL);
                } else {
                        amdgpu_device_invalidate_hdp(adev, NULL);
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

        }

        return count;
#else
        return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               void *buf, size_t size, bool write)
{
        size_t count;

        /* try using the vram aperture to access vram first */
        count = amdgpu_device_aper_access(adev, pos, buf, size, write);
        size -= count;
        if (size) {
                /* use MM to access the rest of vram */
                pos += count;
                buf += count;
                amdgpu_device_mm_access(adev, pos, buf, size, write);
        }
}
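
/*
 * Illustrative caller (a sketch; the offset and local variable are
 * hypothetical): read one dword at VRAM offset 0x1000, going through the
 * CPU-visible aperture when possible and falling back to
 * MM_INDEX/MM_DATA otherwise:
 *
 *   u32 tmp;
 *
 *   amdgpu_device_vram_access(adev, 0x1000, &tmp, sizeof(tmp), false);
 */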

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
        if (adev->no_hw_access)
                return true;

#ifdef CONFIG_LOCKDEP
        /*
         * This is a bit complicated to understand, so worth a comment. What we assert
         * here is that the GPU reset is not running on another thread in parallel.
         *
         * For this we trylock the read side of the reset semaphore, if that succeeds
         * we know that the reset is not running in parallel.
         *
         * If the trylock fails we assert that we are either already holding the read
         * side of the lock or are the reset thread itself and hold the write side of
         * the lock.
         */
        if (in_task()) {
                if (down_read_trylock(&adev->reset_domain->sem))
                        up_read(&adev->reset_domain->sem);
                else
                        lockdep_assert_held(&adev->reset_domain->sem);
        }
#endif
        return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
                            uint32_t reg, uint32_t acc_flags)
{
        uint32_t ret;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

        return ret;
}

/*
 * MMIO register read with byte offset helper function
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (offset < adev->rmmio_size)
                return (readb(adev->rmmio + offset));
        BUG();
}

/*
 * MMIO register write with byte offset helper function
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (offset < adev->rmmio_size)
                writeb(value, adev->rmmio + offset);
        else
                BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
                        uint32_t reg, uint32_t v,
                        uint32_t acc_flags)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        amdgpu_kiq_wreg(adev, reg, v);
                        up_read(&adev->reset_domain->sem);
                } else {
                        writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                adev->pcie_wreg(adev, reg * 4, v);
        }

        trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
                             uint32_t reg, uint32_t v)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (amdgpu_sriov_fullaccess(adev) &&
            adev->gfx.rlc.funcs &&
            adev->gfx.rlc.funcs->is_rlcg_access_range) {
                if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
                        return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
        } else if ((reg * 4) >= adev->rmmio_size) {
                adev->pcie_wreg(adev, reg * 4, v);
        } else {
                writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
        }
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (index < adev->doorbell.num_doorbells) {
                return readl(adev->doorbell.ptr + index);
        } else {
                DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
                return 0;
        }
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (index < adev->doorbell.num_doorbells) {
                writel(v, adev->doorbell.ptr + index);
        } else {
                DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
        }
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (index < adev->doorbell.num_doorbells) {
                return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
        } else {
                DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
                return 0;
        }
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (index < adev->doorbell.num_doorbells) {
                atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
        } else {
                DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
        }
}
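
/*
 * Illustrative use (a sketch, not taken from this file): a ring commonly
 * publishes its write pointer through these helpers, e.g.
 *
 *   amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 *
 * where @ring is hypothetical here; real callers usually go through the
 * WDOORBELL/WDOORBELL64 convenience macros instead of calling these
 * functions directly.
 */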

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
                                u32 pcie_index, u32 pcie_data,
                                u32 reg_addr)
{
        unsigned long flags;
        u32 r;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        r = readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

        return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64-bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
                                  u32 pcie_index, u32 pcie_data,
                                  u32 reg_addr)
{
        unsigned long flags;
        u64 r;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        /* read low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        r = readl(pcie_data_offset);
        /* read high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        r |= ((u64)readl(pcie_data_offset) << 32);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

        return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
                                 u32 pcie_index, u32 pcie_data,
                                 u32 reg_addr, u32 reg_data)
{
        unsigned long flags;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        writel(reg_data, pcie_data_offset);
        readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64-bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
                                   u32 pcie_index, u32 pcie_data,
                                   u32 reg_addr, u64 reg_data)
{
        unsigned long flags;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        /* write low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
        readl(pcie_data_offset);
        /* write high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        writel((u32)(reg_data >> 32), pcie_data_offset);
        readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
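
/*
 * Illustrative caller (a sketch modeled on how SOC code typically wires up
 * its pcie_rreg/pcie_wreg callbacks; the function name and the nbio
 * accessors below are assumptions, not definitions from this file):
 *
 *   static u32 soc_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           u32 index = adev->nbio.funcs->get_pcie_index_offset(adev);
 *           u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *           return amdgpu_device_indirect_rreg(adev, index, data, reg);
 *   }
 */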

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
        DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
                                          uint32_t block, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
                  reg, block);
        BUG();
        return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
                                      uint32_t block,
                                      uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
                  reg, block, v);
        BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
        amdgpu_asic_pre_asic_init(adev);

        if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
                return amdgpu_atomfirmware_asic_init(adev, true);
        else
                return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
        return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
                                       AMDGPU_GEM_DOMAIN_VRAM |
                                       AMDGPU_GEM_DOMAIN_GTT,
                                       &adev->mem_scratch.robj,
                                       &adev->mem_scratch.gpu_addr,
                                       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
        amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
                                             const u32 *registers,
                                             const u32 array_size)
{
        u32 tmp, reg, and_mask, or_mask;
        int i;

        if (array_size % 3)
                return;

        for (i = 0; i < array_size; i += 3) {
                reg = registers[i + 0];
                and_mask = registers[i + 1];
                or_mask = registers[i + 2];

                if (and_mask == 0xffffffff) {
                        tmp = or_mask;
                } else {
                        tmp = RREG32(reg);
                        tmp &= ~and_mask;
                        if (adev->family >= AMDGPU_FAMILY_AI)
                                tmp |= (or_mask & and_mask);
                        else
                                tmp |= or_mask;
                }
                WREG32(reg, tmp);
        }
}
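
/*
 * Illustrative golden-register table (register names and values below are
 * hypothetical): entries are consumed in {reg, and_mask, or_mask} triplets,
 * where an and_mask of 0xffffffff writes or_mask directly:
 *
 *   static const u32 golden_settings[] = {
 *           mmFOO_CTRL, 0xffffffff, 0x00000001,   // direct write
 *           mmBAR_CTRL, 0x0000ff00, 0x00002a00,   // read-modify-write
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings,
 *                                           ARRAY_SIZE(golden_settings));
 */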

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
        pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
        return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{
        /* No doorbell on SI hardware generation */
        if (adev->asic_type < CHIP_BONAIRE) {
                adev->doorbell.base = 0;
                adev->doorbell.size = 0;
                adev->doorbell.num_doorbells = 0;
                adev->doorbell.ptr = NULL;
                return 0;
        }

        if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
                return -EINVAL;

        amdgpu_asic_init_doorbell_index(adev);

        /* doorbell bar mapping */
        adev->doorbell.base = pci_resource_start(adev->pdev, 2);
        adev->doorbell.size = pci_resource_len(adev->pdev, 2);

        if (adev->enable_mes) {
                adev->doorbell.num_doorbells =
                        adev->doorbell.size / sizeof(u32);
        } else {
                adev->doorbell.num_doorbells =
                        min_t(u32, adev->doorbell.size / sizeof(u32),
                              adev->doorbell_index.max_assignment + 1);
                if (adev->doorbell.num_doorbells == 0)
                        return -EINVAL;

                /* For Vega, reserve and map two pages on the doorbell BAR since
                 * the SDMA paging queue doorbell uses the second page. The
                 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
                 * doorbells are in the first page, so with the paging queue
                 * enabled, num_doorbells grows by one page (0x400 in dwords).
                 */
                if (adev->asic_type >= CHIP_VEGA10)
                        adev->doorbell.num_doorbells += 0x400;
        }

        adev->doorbell.ptr = ioremap(adev->doorbell.base,
                                     adev->doorbell.num_doorbells *
                                     sizeof(u32));
        if (adev->doorbell.ptr == NULL)
                return -ENOMEM;

        return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
        iounmap(adev->doorbell.ptr);
        adev->doorbell.ptr = NULL;
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
        if (adev->wb.wb_obj) {
                amdgpu_bo_free_kernel(&adev->wb.wb_obj,
                                      &adev->wb.gpu_addr,
                                      (void **)&adev->wb.wb);
                adev->wb.wb_obj = NULL;
        }
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
        int r;

        if (adev->wb.wb_obj == NULL) {
                /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
                r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
                                            PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
                                            &adev->wb.wb_obj, &adev->wb.gpu_addr,
                                            (void **)&adev->wb.wb);
                if (r) {
                        dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
                        return r;
                }

                adev->wb.num_wb = AMDGPU_MAX_WB;
                memset(&adev->wb.used, 0, sizeof(adev->wb.used));

                /* clear wb memory */
                memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
        }

        return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
        unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

        if (offset < adev->wb.num_wb) {
                __set_bit(offset, adev->wb.used);
                *wb = offset << 3; /* convert to dw offset */
                return 0;
        } else {
                return -EINVAL;
        }
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
        wb >>= 3;
        if (wb < adev->wb.num_wb)
                __clear_bit(wb, adev->wb.used);
}
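
/*
 * Typical pattern (a sketch; the surrounding code is hypothetical): grab a
 * writeback slot, use the returned dword offset into the wb area, then
 * return the slot when done:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u32 val = adev->wb.wb[wb];    // CPU view of the slot
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */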

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
        int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
        struct pci_bus *root;
        struct resource *res;
        unsigned i;
        u16 cmd;
        int r;

        /* Bypass for VF */
        if (amdgpu_sriov_vf(adev))
                return 0;

        /* skip if the bios has already enabled large BAR */
        if (adev->gmc.real_vram_size &&
            (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
                return 0;

        /* Check if the root BUS has 64bit memory resources */
        root = adev->pdev->bus;
        while (root->parent)
                root = root->parent;

        pci_bus_for_each_resource(root, res, i) {
                if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
                    res->start > 0x100000000ull)
                        break;
        }

        /* Trying to resize is pointless without a root hub window above 4GB */
        if (!res)
                return 0;

        /* Limit the BAR size to what is available */
        rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
                        rbar_size);

        /* Disable memory decoding while we change the BAR addresses and size */
        pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
        pci_write_config_word(adev->pdev, PCI_COMMAND,
                              cmd & ~PCI_COMMAND_MEMORY);

        /* Free the VRAM and doorbell BAR, we most likely need to move both. */
        amdgpu_device_doorbell_fini(adev);
        if (adev->asic_type >= CHIP_BONAIRE)
                pci_release_resource(adev->pdev, 2);

        pci_release_resource(adev->pdev, 0);

        r = pci_resize_resource(adev->pdev, 0, rbar_size);
        if (r == -ENOSPC)
                DRM_INFO("Not enough PCI address space for a large BAR.");
        else if (r && r != -ENOTSUPP)
                DRM_ERROR("Problem resizing BAR0 (%d).", r);

        pci_assign_unassigned_bus_resources(adev->pdev->bus);

        /* When the doorbell or fb BAR isn't available we have no chance of
         * using the device.
         */
        r = amdgpu_device_doorbell_init(adev);
        if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
                return -ENODEV;

        pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

        return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
        uint32_t reg;

        if (amdgpu_sriov_vf(adev))
                return false;

        if (amdgpu_passthrough(adev)) {
                /* For FIJI: In the whole-GPU pass-through virtualization case,
                 * after a VM reboot some old smc fw still needs the driver to
                 * do vPost, otherwise the gpu hangs. smc fw versions above
                 * 22.15 don't have this flaw, so we force vPost for smc
                 * versions below 22.15.
                 */
                if (adev->asic_type == CHIP_FIJI) {
                        int err;
                        uint32_t fw_ver;

                        err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
                        /* force vPost if an error occurred */
                        if (err)
                                return true;

                        fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
                        if (fw_ver < 0x00160e00)
                                return true;
                }
        }

        /* Don't post if we need to reset whole hive on init */
        if (adev->gmc.xgmi.pending_reset)
                return false;

        if (adev->has_hw_reset) {
                adev->has_hw_reset = false;
                return true;
        }

        /* bios scratch used on CIK+ */
        if (adev->asic_type >= CHIP_BONAIRE)
                return amdgpu_atombios_scratch_need_asic_init(adev);

        /* check MEM_SIZE for older asics */
        reg = amdgpu_asic_get_config_memsize(adev);

        if ((reg != 0) && (reg != 0xffffffff))
                return false;

        return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
        switch (amdgpu_aspm) {
        case -1:
                break;
        case 0:
                return false;
        case 1:
                return true;
        default:
                return false;
        }
        return pcie_aspm_enabled(adev->pdev);
}
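
/*
 * The decision above is driven by the amdgpu.aspm module parameter
 * (-1 = respect what the platform/bridge already enabled, 0 = force off,
 * 1 = force on), e.g. (illustrative):
 *
 *   modprobe amdgpu aspm=0
 */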

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
                bool state)
{
        struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

        amdgpu_asic_set_vga_state(adev, state);
        if (state)
                return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
                       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
        else
                return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of 9
 * bits in the page table, and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
        /* defines the number of bits in page table versus page directory,
         * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
         * page table and the remaining bits are in the page directory */
        if (amdgpu_vm_block_size == -1)
                return;

        if (amdgpu_vm_block_size < 9) {
                dev_warn(adev->dev, "VM page table size (%d) too small\n",
                         amdgpu_vm_block_size);
                amdgpu_vm_block_size = -1;
        }
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
        /* no need to check the default value */
        if (amdgpu_vm_size == -1)
                return;

        if (amdgpu_vm_size < 1) {
                dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
                         amdgpu_vm_size);
                amdgpu_vm_size = -1;
        }
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
        struct sysinfo si;
        bool is_os_64 = (sizeof(void *) == 8);
        uint64_t total_memory;
        uint64_t dram_size_seven_GB = 0x1B8000000;
        uint64_t dram_size_three_GB = 0xB8000000;

        if (amdgpu_smu_memory_pool_size == 0)
                return;

        if (!is_os_64) {
                DRM_WARN("Not 64-bit OS, feature not supported\n");
                goto def_value;
        }
        si_meminfo(&si);
        total_memory = (uint64_t)si.totalram * si.mem_unit;

        if ((amdgpu_smu_memory_pool_size == 1) ||
                (amdgpu_smu_memory_pool_size == 2)) {
                if (total_memory < dram_size_three_GB)
                        goto def_value1;
        } else if ((amdgpu_smu_memory_pool_size == 4) ||
                (amdgpu_smu_memory_pool_size == 8)) {
                if (total_memory < dram_size_seven_GB)
                        goto def_value1;
        } else {
                DRM_WARN("Smu memory pool size not supported\n");
                goto def_value;
        }
        adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

        return;

def_value1:
        DRM_WARN("Not enough system memory\n");
def_value:
        adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
        if (!(adev->flags & AMD_IS_APU) ||
            adev->asic_type < CHIP_RAVEN)
                return 0;

        switch (adev->asic_type) {
        case CHIP_RAVEN:
                if (adev->pdev->device == 0x15dd)
                        adev->apu_flags |= AMD_APU_IS_RAVEN;
                if (adev->pdev->device == 0x15d8)
                        adev->apu_flags |= AMD_APU_IS_PICASSO;
                break;
        case CHIP_RENOIR:
                if ((adev->pdev->device == 0x1636) ||
                    (adev->pdev->device == 0x164c))
                        adev->apu_flags |= AMD_APU_IS_RENOIR;
                else
                        adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
                break;
        case CHIP_VANGOGH:
                adev->apu_flags |= AMD_APU_IS_VANGOGH;
                break;
        case CHIP_YELLOW_CARP:
                break;
        case CHIP_CYAN_SKILLFISH:
                if ((adev->pdev->device == 0x13FE) ||
                    (adev->pdev->device == 0x143F))
                        adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
                break;
        default:
                break;
        }

        return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
        if (amdgpu_sched_jobs < 4) {
                dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
                         amdgpu_sched_jobs);
                amdgpu_sched_jobs = 4;
        } else if (!is_power_of_2(amdgpu_sched_jobs)) {
                dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
                         amdgpu_sched_jobs);
                amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
        }

        if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
                /* gart size must be greater than or equal to 32M */
                dev_warn(adev->dev, "gart size (%d) too small\n",
                         amdgpu_gart_size);
                amdgpu_gart_size = -1;
        }

        if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
                /* gtt size must be greater than or equal to 32M */
                dev_warn(adev->dev, "gtt size (%d) too small\n",
                         amdgpu_gtt_size);
                amdgpu_gtt_size = -1;
        }

        /* valid range is between 4 and 9 inclusive */
        if (amdgpu_vm_fragment_size != -1 &&
            (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
                dev_warn(adev->dev, "valid range is between 4 and 9\n");
                amdgpu_vm_fragment_size = -1;
        }

        if (amdgpu_sched_hw_submission < 2) {
                dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
                         amdgpu_sched_hw_submission);
                amdgpu_sched_hw_submission = 2;
        } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
                dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
                         amdgpu_sched_hw_submission);
                amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
        }

        if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
                dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
                amdgpu_reset_method = -1;
        }

        amdgpu_device_check_smu_prv_buffer_size(adev);

        amdgpu_device_check_vm_size(adev);

        amdgpu_device_check_block_size(adev);

        adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

        return 0;
}
1571
1572 /**
1573  * amdgpu_switcheroo_set_state - set switcheroo state
1574  *
1575  * @pdev: pci dev pointer
1576  * @state: vga_switcheroo state
1577  *
1578  * Callback for the switcheroo driver.  Suspends or resumes
1579  * the asics before or after it is powered up using ACPI methods.
1580  */
1581 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1582                                         enum vga_switcheroo_state state)
1583 {
1584         struct drm_device *dev = pci_get_drvdata(pdev);
1585         int r;
1586
1587         if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1588                 return;
1589
1590         if (state == VGA_SWITCHEROO_ON) {
1591                 pr_info("switched on\n");
1592                 /* don't suspend or resume card normally */
1593                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1594
1595                 pci_set_power_state(pdev, PCI_D0);
1596                 amdgpu_device_load_pci_state(pdev);
1597                 r = pci_enable_device(pdev);
1598                 if (r)
1599                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1600                 amdgpu_device_resume(dev, true);
1601
1602                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1603         } else {
1604                 pr_info("switched off\n");
1605                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1606                 amdgpu_device_suspend(dev, true);
1607                 amdgpu_device_cache_pci_state(pdev);
1608                 /* Shut down the device */
1609                 pci_disable_device(pdev);
1610                 pci_set_power_state(pdev, PCI_D3cold);
1611                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1612         }
1613 }
1614
1615 /**
1616  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1617  *
1618  * @pdev: pci dev pointer
1619  *
1620  * Callback for the switcheroo driver.  Checks if the switcheroo
1621  * state can be changed.
1622  * Returns true if the state can be changed, false if not.
1623  */
1624 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1625 {
1626         struct drm_device *dev = pci_get_drvdata(pdev);
1627
1628         /*
1629          * FIXME: open_count is protected by drm_global_mutex but that would lead to
1630          * locking inversion with the driver load path. And the access here is
1631          * completely racy anyway. So don't bother with locking for now.
1632          */
1633         return atomic_read(&dev->open_count) == 0;
1634 }
1635
1636 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1637         .set_gpu_state = amdgpu_switcheroo_set_state,
1638         .reprobe = NULL,
1639         .can_switch = amdgpu_switcheroo_can_switch,
1640 };
1641
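/*
 * Editor's sketch (not part of the upstream flow): the ops table above is
 * handed to the vga_switcheroo core during device init, roughly as below.
 * vga_switcheroo_register_client() is the real API; the wrapper name is
 * hypothetical, and the "px" flag mirrors amdgpu_device_supports_px().
 */
static inline int example_register_switcheroo(struct amdgpu_device *adev,
                                              bool px)
{
        /* register the suspend/resume and can-switch callbacks */
        return vga_switcheroo_register_client(adev->pdev,
                                              &amdgpu_switcheroo_ops, px);
}
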
1642 /**
1643  * amdgpu_device_ip_set_clockgating_state - set the CG state
1644  *
1645  * @dev: amdgpu_device pointer
1646  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1647  * @state: clockgating state (gate or ungate)
1648  *
1649  * Sets the requested clockgating state for all instances of
1650  * the hardware IP specified.
1651  * Returns the error code from the last instance.
1652  */
1653 int amdgpu_device_ip_set_clockgating_state(void *dev,
1654                                            enum amd_ip_block_type block_type,
1655                                            enum amd_clockgating_state state)
1656 {
1657         struct amdgpu_device *adev = dev;
1658         int i, r = 0;
1659
1660         for (i = 0; i < adev->num_ip_blocks; i++) {
1661                 if (!adev->ip_blocks[i].status.valid)
1662                         continue;
1663                 if (adev->ip_blocks[i].version->type != block_type)
1664                         continue;
1665                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1666                         continue;
1667                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1668                         (void *)adev, state);
1669                 if (r)
1670                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1671                                   adev->ip_blocks[i].version->funcs->name, r);
1672         }
1673         return r;
1674 }
1675
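/*
 * Editor's sketch: a typical caller gates all GFX instances in one call.
 * The enum values are the real ones from amd_shared.h; the wrapper itself
 * is hypothetical.  amdgpu_device_ip_set_powergating_state() below follows
 * the same calling pattern for powergating.
 */
static inline void example_gate_gfx_clocks(struct amdgpu_device *adev)
{
        int r = amdgpu_device_ip_set_clockgating_state(adev,
                                                       AMD_IP_BLOCK_TYPE_GFX,
                                                       AMD_CG_STATE_GATE);

        if (r)
                dev_warn(adev->dev, "GFX clockgating failed (%d)\n", r);
}
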
1676 /**
1677  * amdgpu_device_ip_set_powergating_state - set the PG state
1678  *
1679  * @dev: amdgpu_device pointer
1680  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1681  * @state: powergating state (gate or ungate)
1682  *
1683  * Sets the requested powergating state for all instances of
1684  * the hardware IP specified.
1685  * Returns the error code from the last instance.
1686  */
1687 int amdgpu_device_ip_set_powergating_state(void *dev,
1688                                            enum amd_ip_block_type block_type,
1689                                            enum amd_powergating_state state)
1690 {
1691         struct amdgpu_device *adev = dev;
1692         int i, r = 0;
1693
1694         for (i = 0; i < adev->num_ip_blocks; i++) {
1695                 if (!adev->ip_blocks[i].status.valid)
1696                         continue;
1697                 if (adev->ip_blocks[i].version->type != block_type)
1698                         continue;
1699                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1700                         continue;
1701                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1702                         (void *)adev, state);
1703                 if (r)
1704                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1705                                   adev->ip_blocks[i].version->funcs->name, r);
1706         }
1707         return r;
1708 }
1709
1710 /**
1711  * amdgpu_device_ip_get_clockgating_state - get the CG state
1712  *
1713  * @adev: amdgpu_device pointer
1714  * @flags: clockgating feature flags
1715  *
1716  * Walks the list of IPs on the device and updates the clockgating
1717  * flags for each IP.
1718  * Updates @flags with the feature flags for each hardware IP where
1719  * clockgating is enabled.
1720  */
1721 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1722                                             u64 *flags)
1723 {
1724         int i;
1725
1726         for (i = 0; i < adev->num_ip_blocks; i++) {
1727                 if (!adev->ip_blocks[i].status.valid)
1728                         continue;
1729                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1730                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1731         }
1732 }
1733
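/*
 * Editor's sketch: reading back the aggregated CG feature flags, e.g. for
 * reporting through debugfs.  AMD_CG_SUPPORT_GFX_MGCG is a real flag bit;
 * the helper itself is hypothetical.
 */
static inline bool example_gfx_mgcg_enabled(struct amdgpu_device *adev)
{
        u64 flags = 0;

        amdgpu_device_ip_get_clockgating_state(adev, &flags);
        return flags & AMD_CG_SUPPORT_GFX_MGCG;
}
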
1734 /**
1735  * amdgpu_device_ip_wait_for_idle - wait for idle
1736  *
1737  * @adev: amdgpu_device pointer
1738  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1739  *
1740  * Waits for the requested hardware IP to be idle.
1741  * Returns 0 for success or a negative error code on failure.
1742  */
1743 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1744                                    enum amd_ip_block_type block_type)
1745 {
1746         int i, r;
1747
1748         for (i = 0; i < adev->num_ip_blocks; i++) {
1749                 if (!adev->ip_blocks[i].status.valid)
1750                         continue;
1751                 if (adev->ip_blocks[i].version->type == block_type) {
1752                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1753                         if (r)
1754                                 return r;
1755                         break;
1756                 }
1757         }
1758         return 0;
1759
1760 }
1761
1762 /**
1763  * amdgpu_device_ip_is_idle - is the hardware IP idle
1764  *
1765  * @adev: amdgpu_device pointer
1766  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1767  *
1768  * Check if the hardware IP is idle or not.
1769  * Returns true if the IP is idle, false if not.
1770  */
1771 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1772                               enum amd_ip_block_type block_type)
1773 {
1774         int i;
1775
1776         for (i = 0; i < adev->num_ip_blocks; i++) {
1777                 if (!adev->ip_blocks[i].status.valid)
1778                         continue;
1779                 if (adev->ip_blocks[i].version->type == block_type)
1780                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1781         }
1782         return true;
1783
1784 }
1785
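/*
 * Editor's sketch: the two helpers above are typically paired, checking
 * for idleness cheaply before paying for a full wait.  Hypothetical caller.
 */
static inline int example_quiesce_gfx(struct amdgpu_device *adev)
{
        if (amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX))
                return 0;

        return amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
}
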
1786 /**
1787  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1788  *
1789  * @adev: amdgpu_device pointer
1790  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1791  *
1792  * Returns a pointer to the hardware IP block structure
1793  * if it exists for the asic, otherwise NULL.
1794  */
1795 struct amdgpu_ip_block *
1796 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1797                               enum amd_ip_block_type type)
1798 {
1799         int i;
1800
1801         for (i = 0; i < adev->num_ip_blocks; i++)
1802                 if (adev->ip_blocks[i].version->type == type)
1803                         return &adev->ip_blocks[i];
1804
1805         return NULL;
1806 }
1807
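/*
 * Editor's sketch: callers must NULL-check the lookup, since not every
 * asic carries every IP type.  Hypothetical helper.
 */
static inline u32 example_gfx_major_version(struct amdgpu_device *adev)
{
        struct amdgpu_ip_block *ip =
                amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);

        return ip ? ip->version->major : 0;
}
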
1808 /**
1809  * amdgpu_device_ip_block_version_cmp
1810  *
1811  * @adev: amdgpu_device pointer
1812  * @type: enum amd_ip_block_type
1813  * @major: major version
1814  * @minor: minor version
1815  *
1816  * Returns 0 if the IP block version is equal to or greater than the
1817  * requested version, 1 if it is smaller or the ip_block doesn't exist.
1818  */
1819 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1820                                        enum amd_ip_block_type type,
1821                                        u32 major, u32 minor)
1822 {
1823         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1824
1825         if (ip_block && ((ip_block->version->major > major) ||
1826                         ((ip_block->version->major == major) &&
1827                         (ip_block->version->minor >= minor))))
1828                 return 0;
1829
1830         return 1;
1831 }
1832
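/*
 * Editor's sketch: gating a feature on a minimum IP version; the helper
 * above returns 0 when the asic's GFX block is at least 9.0.  Hypothetical
 * wrapper.
 */
static inline bool example_has_gfx9_or_newer(struct amdgpu_device *adev)
{
        return amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
                                                  9, 0) == 0;
}
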
1833 /**
1834  * amdgpu_device_ip_block_add
1835  *
1836  * @adev: amdgpu_device pointer
1837  * @ip_block_version: pointer to the IP to add
1838  *
1839  * Adds the IP block driver information to the collection of IPs
1840  * on the asic.
1841  */
1842 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1843                                const struct amdgpu_ip_block_version *ip_block_version)
1844 {
1845         if (!ip_block_version)
1846                 return -EINVAL;
1847
1848         switch (ip_block_version->type) {
1849         case AMD_IP_BLOCK_TYPE_VCN:
1850                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1851                         return 0;
1852                 break;
1853         case AMD_IP_BLOCK_TYPE_JPEG:
1854                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1855                         return 0;
1856                 break;
1857         default:
1858                 break;
1859         }
1860
1861         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1862                   ip_block_version->funcs->name);
1863
1864         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1865
1866         return 0;
1867 }
1868
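/*
 * Editor's note: the asic-specific set_ip_blocks() callbacks chain calls
 * like the one below; vi_common_ip_block is a real version table exported
 * by vi.c, while the wrapper here is hypothetical.
 */
static inline int example_add_common_block(struct amdgpu_device *adev)
{
        return amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
}
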
1869 /**
1870  * amdgpu_device_enable_virtual_display - enable virtual display feature
1871  *
1872  * @adev: amdgpu_device pointer
1873  *
1874  * Enables the virtual display feature if the user has enabled it via
1875  * the module parameter virtual_display.  This feature provides virtual
1876  * display hardware on headless boards or in virtualized environments.
1877  * This function parses and validates the configuration string specified by
1878  * the user and configures the virtual display settings (number of
1879  * virtual connectors, crtcs, etc.) accordingly.
1880  */
1881 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1882 {
1883         adev->enable_virtual_display = false;
1884
1885         if (amdgpu_virtual_display) {
1886                 const char *pci_address_name = pci_name(adev->pdev);
1887                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1888
1889                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1890                 pciaddstr_tmp = pciaddstr;
1891                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1892                         pciaddname = strsep(&pciaddname_tmp, ",");
1893                         if (!strcmp("all", pciaddname) ||
1894                             !strcmp(pci_address_name, pciaddname)) {
1895                                 long num_crtc;
1896                                 int res = -1;
1897
1898                                 adev->enable_virtual_display = true;
1899
1900                                 if (pciaddname_tmp)
1901                                         res = kstrtol(pciaddname_tmp, 10,
1902                                                       &num_crtc);
1903
1904                                 if (!res) {
1905                                         if (num_crtc < 1)
1906                                                 num_crtc = 1;
1907                                         if (num_crtc > 6)
1908                                                 num_crtc = 6;
1909                                         adev->mode_info.num_crtc = num_crtc;
1910                                 } else {
1911                                         adev->mode_info.num_crtc = 1;
1912                                 }
1913                                 break;
1914                         }
1915                 }
1916
1917                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1918                          amdgpu_virtual_display, pci_address_name,
1919                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1920
1921                 kfree(pciaddstr);
1922         }
1923 }
1924
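/*
 * Editor's note: the string parsed above comes from the amdgpu.virtual_display
 * module parameter.  Hypothetical example values:
 *
 *   amdgpu.virtual_display=0000:01:00.0,2   - two virtual crtcs on that GPU
 *   amdgpu.virtual_display=all,1            - one virtual crtc on every GPU
 */
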
1925 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1926 {
1927         if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1928                 adev->mode_info.num_crtc = 1;
1929                 adev->enable_virtual_display = true;
1930                 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1931                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1932         }
1933 }
1934
1935 /**
1936  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1937  *
1938  * @adev: amdgpu_device pointer
1939  *
1940  * Parses the asic configuration parameters specified in the gpu info
1941  * firmware and makes them available to the driver for use in configuring
1942  * the asic.
1943  * Returns 0 on success, -EINVAL on failure.
1944  */
1945 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1946 {
1947         const char *chip_name;
1948         char fw_name[40];
1949         int err;
1950         const struct gpu_info_firmware_header_v1_0 *hdr;
1951
1952         adev->firmware.gpu_info_fw = NULL;
1953
1954         if (adev->mman.discovery_bin) {
1955                 /*
1956                  * FIXME: The bounding box is still needed by Navi12, so
1957                  * temporarily read it from gpu_info firmware. Should be dropped
1958                  * when DAL no longer needs it.
1959                  */
1960                 if (adev->asic_type != CHIP_NAVI12)
1961                         return 0;
1962         }
1963
1964         switch (adev->asic_type) {
1965         default:
1966                 return 0;
1967         case CHIP_VEGA10:
1968                 chip_name = "vega10";
1969                 break;
1970         case CHIP_VEGA12:
1971                 chip_name = "vega12";
1972                 break;
1973         case CHIP_RAVEN:
1974                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1975                         chip_name = "raven2";
1976                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1977                         chip_name = "picasso";
1978                 else
1979                         chip_name = "raven";
1980                 break;
1981         case CHIP_ARCTURUS:
1982                 chip_name = "arcturus";
1983                 break;
1984         case CHIP_NAVI12:
1985                 chip_name = "navi12";
1986                 break;
1987         }
1988
1989         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1990         err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1991         if (err) {
1992                 dev_err(adev->dev,
1993                         "Failed to get gpu_info firmware \"%s\"\n",
1994                         fw_name);
1995                 goto out;
1996         }
1997
1998         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1999         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2000
2001         switch (hdr->version_major) {
2002         case 1:
2003         {
2004                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2005                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2006                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2007
2008                 /*
2009                  * Should be dropped when DAL no longer needs it.
2010                  */
2011                 if (adev->asic_type == CHIP_NAVI12)
2012                         goto parse_soc_bounding_box;
2013
2014                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2015                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2016                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2017                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2018                 adev->gfx.config.max_texture_channel_caches =
2019                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
2020                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2021                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2022                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2023                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2024                 adev->gfx.config.double_offchip_lds_buf =
2025                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2026                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2027                 adev->gfx.cu_info.max_waves_per_simd =
2028                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2029                 adev->gfx.cu_info.max_scratch_slots_per_cu =
2030                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2031                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2032                 if (hdr->version_minor >= 1) {
2033                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2034                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2035                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2036                         adev->gfx.config.num_sc_per_sh =
2037                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2038                         adev->gfx.config.num_packer_per_sc =
2039                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2040                 }
2041
2042 parse_soc_bounding_box:
2043                 /*
2044                  * soc bounding box info is not integrated into the discovery table,
2045                  * so we always need to parse it from the gpu info firmware when needed.
2046                  */
2047                 if (hdr->version_minor == 2) {
2048                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2049                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2050                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2051                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2052                 }
2053                 break;
2054         }
2055         default:
2056                 dev_err(adev->dev,
2057                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2058                 err = -EINVAL;
2059                 goto out;
2060         }
2061 out:
2062         return err;
2063 }
2064
2065 /**
2066  * amdgpu_device_ip_early_init - run early init for hardware IPs
2067  *
2068  * @adev: amdgpu_device pointer
2069  *
2070  * Early initialization pass for hardware IPs.  The hardware IPs that make
2071  * up each asic are discovered and each IP's early_init callback is run.  This
2072  * is the first stage in initializing the asic.
2073  * Returns 0 on success, negative error code on failure.
2074  */
2075 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2076 {
2077         struct drm_device *dev = adev_to_drm(adev);
2078         struct pci_dev *parent;
2079         int i, r;
2080         bool total;
2081
2082         amdgpu_device_enable_virtual_display(adev);
2083
2084         if (amdgpu_sriov_vf(adev)) {
2085                 r = amdgpu_virt_request_full_gpu(adev, true);
2086                 if (r)
2087                         return r;
2088         }
2089
2090         switch (adev->asic_type) {
2091 #ifdef CONFIG_DRM_AMDGPU_SI
2092         case CHIP_VERDE:
2093         case CHIP_TAHITI:
2094         case CHIP_PITCAIRN:
2095         case CHIP_OLAND:
2096         case CHIP_HAINAN:
2097                 adev->family = AMDGPU_FAMILY_SI;
2098                 r = si_set_ip_blocks(adev);
2099                 if (r)
2100                         return r;
2101                 break;
2102 #endif
2103 #ifdef CONFIG_DRM_AMDGPU_CIK
2104         case CHIP_BONAIRE:
2105         case CHIP_HAWAII:
2106         case CHIP_KAVERI:
2107         case CHIP_KABINI:
2108         case CHIP_MULLINS:
2109                 if (adev->flags & AMD_IS_APU)
2110                         adev->family = AMDGPU_FAMILY_KV;
2111                 else
2112                         adev->family = AMDGPU_FAMILY_CI;
2113
2114                 r = cik_set_ip_blocks(adev);
2115                 if (r)
2116                         return r;
2117                 break;
2118 #endif
2119         case CHIP_TOPAZ:
2120         case CHIP_TONGA:
2121         case CHIP_FIJI:
2122         case CHIP_POLARIS10:
2123         case CHIP_POLARIS11:
2124         case CHIP_POLARIS12:
2125         case CHIP_VEGAM:
2126         case CHIP_CARRIZO:
2127         case CHIP_STONEY:
2128                 if (adev->flags & AMD_IS_APU)
2129                         adev->family = AMDGPU_FAMILY_CZ;
2130                 else
2131                         adev->family = AMDGPU_FAMILY_VI;
2132
2133                 r = vi_set_ip_blocks(adev);
2134                 if (r)
2135                         return r;
2136                 break;
2137         default:
2138                 r = amdgpu_discovery_set_ip_blocks(adev);
2139                 if (r)
2140                         return r;
2141                 break;
2142         }
2143
2144         if (amdgpu_has_atpx() &&
2145             (amdgpu_is_atpx_hybrid() ||
2146              amdgpu_has_atpx_dgpu_power_cntl()) &&
2147             ((adev->flags & AMD_IS_APU) == 0) &&
2148             !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2149                 adev->flags |= AMD_IS_PX;
2150
2151         if (!(adev->flags & AMD_IS_APU)) {
2152                 parent = pci_upstream_bridge(adev->pdev);
2153                 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2154         }
2155
2156         amdgpu_amdkfd_device_probe(adev);
2157
2158         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2159         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2160                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2161         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2162                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2163
2164         total = true;
2165         for (i = 0; i < adev->num_ip_blocks; i++) {
2166                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2167                         DRM_ERROR("disabled ip block: %d <%s>\n",
2168                                   i, adev->ip_blocks[i].version->funcs->name);
2169                         adev->ip_blocks[i].status.valid = false;
2170                 } else {
2171                         if (adev->ip_blocks[i].version->funcs->early_init) {
2172                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2173                                 if (r == -ENOENT) {
2174                                         adev->ip_blocks[i].status.valid = false;
2175                                 } else if (r) {
2176                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2177                                                   adev->ip_blocks[i].version->funcs->name, r);
2178                                         total = false;
2179                                 } else {
2180                                         adev->ip_blocks[i].status.valid = true;
2181                                 }
2182                         } else {
2183                                 adev->ip_blocks[i].status.valid = true;
2184                         }
2185                 }
2186                 /* get the vbios after the asic_funcs are set up */
2187                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2188                         r = amdgpu_device_parse_gpu_info_fw(adev);
2189                         if (r)
2190                                 return r;
2191
2192                         /* Read BIOS */
2193                         if (!amdgpu_get_bios(adev))
2194                                 return -EINVAL;
2195
2196                         r = amdgpu_atombios_init(adev);
2197                         if (r) {
2198                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2199                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2200                                 return r;
2201                         }
2202
2203                         /* get pf2vf msg info at its earliest time */
2204                         if (amdgpu_sriov_vf(adev))
2205                                 amdgpu_virt_init_data_exchange(adev);
2206
2207                 }
2208         }
2209         if (!total)
2210                 return -ENODEV;
2211
2212         adev->cg_flags &= amdgpu_cg_mask;
2213         adev->pg_flags &= amdgpu_pg_mask;
2214
2215         return 0;
2216 }
2217
2218 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2219 {
2220         int i, r;
2221
2222         for (i = 0; i < adev->num_ip_blocks; i++) {
2223                 if (!adev->ip_blocks[i].status.sw)
2224                         continue;
2225                 if (adev->ip_blocks[i].status.hw)
2226                         continue;
2227                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2228                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2229                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2230                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2231                         if (r) {
2232                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2233                                           adev->ip_blocks[i].version->funcs->name, r);
2234                                 return r;
2235                         }
2236                         adev->ip_blocks[i].status.hw = true;
2237                 }
2238         }
2239
2240         return 0;
2241 }
2242
2243 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2244 {
2245         int i, r;
2246
2247         for (i = 0; i < adev->num_ip_blocks; i++) {
2248                 if (!adev->ip_blocks[i].status.sw)
2249                         continue;
2250                 if (adev->ip_blocks[i].status.hw)
2251                         continue;
2252                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2253                 if (r) {
2254                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2255                                   adev->ip_blocks[i].version->funcs->name, r);
2256                         return r;
2257                 }
2258                 adev->ip_blocks[i].status.hw = true;
2259         }
2260
2261         return 0;
2262 }
2263
2264 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2265 {
2266         int r = 0;
2267         int i;
2268         uint32_t smu_version;
2269
2270         if (adev->asic_type >= CHIP_VEGA10) {
2271                 for (i = 0; i < adev->num_ip_blocks; i++) {
2272                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2273                                 continue;
2274
2275                         if (!adev->ip_blocks[i].status.sw)
2276                                 continue;
2277
2278                         /* no need to do the fw loading again if already done */
2279                         if (adev->ip_blocks[i].status.hw)
2280                                 break;
2281
2282                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2283                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2284                                 if (r) {
2285                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2286                                                           adev->ip_blocks[i].version->funcs->name, r);
2287                                         return r;
2288                                 }
2289                         } else {
2290                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2291                                 if (r) {
2292                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2293                                                           adev->ip_blocks[i].version->funcs->name, r);
2294                                         return r;
2295                                 }
2296                         }
2297
2298                         adev->ip_blocks[i].status.hw = true;
2299                         break;
2300                 }
2301         }
2302
2303         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2304                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2305
2306         return r;
2307 }
2308
2309 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2310 {
2311         long timeout;
2312         int r, i;
2313
2314         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2315                 struct amdgpu_ring *ring = adev->rings[i];
2316
2317                 /* No need to setup the GPU scheduler for rings that don't need it */
2318                 if (!ring || ring->no_scheduler)
2319                         continue;
2320
2321                 switch (ring->funcs->type) {
2322                 case AMDGPU_RING_TYPE_GFX:
2323                         timeout = adev->gfx_timeout;
2324                         break;
2325                 case AMDGPU_RING_TYPE_COMPUTE:
2326                         timeout = adev->compute_timeout;
2327                         break;
2328                 case AMDGPU_RING_TYPE_SDMA:
2329                         timeout = adev->sdma_timeout;
2330                         break;
2331                 default:
2332                         timeout = adev->video_timeout;
2333                         break;
2334                 }
2335
2336                 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2337                                    ring->num_hw_submission, amdgpu_job_hang_limit,
2338                                    timeout, adev->reset_domain->wq,
2339                                    ring->sched_score, ring->name,
2340                                    adev->dev);
2341                 if (r) {
2342                         DRM_ERROR("Failed to create scheduler on ring %s.\n",
2343                                   ring->name);
2344                         return r;
2345                 }
2346         }
2347
2348         return 0;
2349 }
2350
2351
2352 /**
2353  * amdgpu_device_ip_init - run init for hardware IPs
2354  *
2355  * @adev: amdgpu_device pointer
2356  *
2357  * Main initialization pass for hardware IPs.  The list of all the hardware
2358  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2359  * are run.  sw_init initializes the software state associated with each IP
2360  * and hw_init initializes the hardware associated with each IP.
2361  * Returns 0 on success, negative error code on failure.
2362  */
2363 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2364 {
2365         int i, r;
2366
2367         r = amdgpu_ras_init(adev);
2368         if (r)
2369                 return r;
2370
2371         for (i = 0; i < adev->num_ip_blocks; i++) {
2372                 if (!adev->ip_blocks[i].status.valid)
2373                         continue;
2374                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2375                 if (r) {
2376                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2377                                   adev->ip_blocks[i].version->funcs->name, r);
2378                         goto init_failed;
2379                 }
2380                 adev->ip_blocks[i].status.sw = true;
2381
2382                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2383                         /* need to do common hw init early so everything is set up for gmc */
2384                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2385                         if (r) {
2386                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2387                                 goto init_failed;
2388                         }
2389                         adev->ip_blocks[i].status.hw = true;
2390                 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2391                         /* need to do gmc hw init early so we can allocate gpu mem */
2392                         /* Try to reserve bad pages early */
2393                         if (amdgpu_sriov_vf(adev))
2394                                 amdgpu_virt_exchange_data(adev);
2395
2396                         r = amdgpu_device_mem_scratch_init(adev);
2397                         if (r) {
2398                                 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2399                                 goto init_failed;
2400                         }
2401                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2402                         if (r) {
2403                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2404                                 goto init_failed;
2405                         }
2406                         r = amdgpu_device_wb_init(adev);
2407                         if (r) {
2408                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2409                                 goto init_failed;
2410                         }
2411                         adev->ip_blocks[i].status.hw = true;
2412
2413                         /* right after GMC hw init, we create CSA */
2414                         if (amdgpu_mcbp) {
2415                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2416                                                                AMDGPU_GEM_DOMAIN_VRAM |
2417                                                                AMDGPU_GEM_DOMAIN_GTT,
2418                                                                AMDGPU_CSA_SIZE);
2419                                 if (r) {
2420                                         DRM_ERROR("allocate CSA failed %d\n", r);
2421                                         goto init_failed;
2422                                 }
2423                         }
2424                 }
2425         }
2426
2427         if (amdgpu_sriov_vf(adev))
2428                 amdgpu_virt_init_data_exchange(adev);
2429
2430         r = amdgpu_ib_pool_init(adev);
2431         if (r) {
2432                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2433                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2434                 goto init_failed;
2435         }
2436
2437         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2438         if (r)
2439                 goto init_failed;
2440
2441         r = amdgpu_device_ip_hw_init_phase1(adev);
2442         if (r)
2443                 goto init_failed;
2444
2445         r = amdgpu_device_fw_loading(adev);
2446         if (r)
2447                 goto init_failed;
2448
2449         r = amdgpu_device_ip_hw_init_phase2(adev);
2450         if (r)
2451                 goto init_failed;
2452
2453         /*
2454          * Retired pages will be loaded from eeprom and reserved here.
2455          * This should be called after amdgpu_device_ip_hw_init_phase2, since
2456          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2457          * functional for I2C communication, which is only true at this point.
2458          *
2459          * amdgpu_ras_recovery_init may fail, but the upper layers only care
2460          * about failures caused by a bad gpu situation and stop the amdgpu
2461          * init process accordingly. For other failed cases, it still releases
2462          * all the resources and prints an error message, rather than returning
2463          * a negative value to the upper level.
2464          *
2465          * Note: theoretically, this should be called before all vram allocations
2466          * to protect retired pages from being abused.
2467          */
2468         r = amdgpu_ras_recovery_init(adev);
2469         if (r)
2470                 goto init_failed;
2471
2472         /*
2473          * In case of XGMI, grab an extra reference to the reset domain for this device
2474          */
2475         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2476                 if (amdgpu_xgmi_add_device(adev) == 0) {
2477                         if (!amdgpu_sriov_vf(adev)) {
2478                                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2479
2480                                 if (WARN_ON(!hive)) {
2481                                         r = -ENOENT;
2482                                         goto init_failed;
2483                                 }
2484
2485                                 if (!hive->reset_domain ||
2486                                     !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2487                                         r = -ENOENT;
2488                                         amdgpu_put_xgmi_hive(hive);
2489                                         goto init_failed;
2490                                 }
2491
2492                                 /* Drop the early temporary reset domain we created for device */
2493                                 amdgpu_reset_put_reset_domain(adev->reset_domain);
2494                                 adev->reset_domain = hive->reset_domain;
2495                                 amdgpu_put_xgmi_hive(hive);
2496                         }
2497                 }
2498         }
2499
2500         r = amdgpu_device_init_schedulers(adev);
2501         if (r)
2502                 goto init_failed;
2503
2504         /* Don't init kfd if the whole hive needs to be reset during init */
2505         if (!adev->gmc.xgmi.pending_reset)
2506                 amdgpu_amdkfd_device_init(adev);
2507
2508         amdgpu_fru_get_product_info(adev);
2509
2510 init_failed:
2511         if (amdgpu_sriov_vf(adev))
2512                 amdgpu_virt_release_full_gpu(adev, true);
2513
2514         return r;
2515 }
2516
2517 /**
2518  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2519  *
2520  * @adev: amdgpu_device pointer
2521  *
2522  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2523  * this function before a GPU reset.  If the value is retained after a
2524  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2525  */
2526 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2527 {
2528         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2529 }
2530
2531 /**
2532  * amdgpu_device_check_vram_lost - check if vram is valid
2533  *
2534  * @adev: amdgpu_device pointer
2535  *
2536  * Checks the reset magic value written to the gart pointer in VRAM.
2537  * The driver calls this after a GPU reset to see if the contents of
2538  * VRAM are lost or not.
2539  * returns true if vram is lost, false if not.
2540  */
2541 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2542 {
2543         if (memcmp(adev->gart.ptr, adev->reset_magic,
2544                         AMDGPU_RESET_MAGIC_NUM))
2545                 return true;
2546
2547         if (!amdgpu_in_reset(adev))
2548                 return false;
2549
2550         /*
2551          * For all ASICs with baco/mode1 reset, the VRAM is
2552          * always assumed to be lost.
2553          */
2554         switch (amdgpu_asic_reset_method(adev)) {
2555         case AMD_RESET_METHOD_BACO:
2556         case AMD_RESET_METHOD_MODE1:
2557                 return true;
2558         default:
2559                 return false;
2560         }
2561 }
2562
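/*
 * Editor's sketch: reset paths pair the two helpers above, writing the
 * magic before the asic reset and testing it afterwards.  Hypothetical
 * caller.
 */
static inline void example_vram_check_around_reset(struct amdgpu_device *adev)
{
        amdgpu_device_fill_reset_magic(adev);   /* before the reset */

        /* ... asic reset happens here ... */

        if (amdgpu_device_check_vram_lost(adev))
                dev_warn(adev->dev, "VRAM contents lost across reset\n");
}
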
2563 /**
2564  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2565  *
2566  * @adev: amdgpu_device pointer
2567  * @state: clockgating state (gate or ungate)
2568  *
2569  * The list of all the hardware IPs that make up the asic is walked and the
2570  * set_clockgating_state callbacks are run.
2571  * During the late init pass, clockgating is enabled for hardware IPs;
2572  * during fini or suspend, it is disabled.
2573  * Returns 0 on success, negative error code on failure.
2574  */
2575
2576 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2577                                enum amd_clockgating_state state)
2578 {
2579         int i, j, r;
2580
2581         if (amdgpu_emu_mode == 1)
2582                 return 0;
2583
2584         for (j = 0; j < adev->num_ip_blocks; j++) {
2585                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2586                 if (!adev->ip_blocks[i].status.late_initialized)
2587                         continue;
2588                 /* skip CG for GFX, SDMA on S0ix */
2589                 if (adev->in_s0ix &&
2590                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2591                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2592                         continue;
2593                 /* skip CG for VCE/UVD, it's handled specially */
2594                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2595                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2596                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2597                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2598                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2599                         /* enable clockgating to save power */
2600                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2601                                                                                      state);
2602                         if (r) {
2603                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2604                                           adev->ip_blocks[i].version->funcs->name, r);
2605                                 return r;
2606                         }
2607                 }
2608         }
2609
2610         return 0;
2611 }
2612
2613 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2614                                enum amd_powergating_state state)
2615 {
2616         int i, j, r;
2617
2618         if (amdgpu_emu_mode == 1)
2619                 return 0;
2620
2621         for (j = 0; j < adev->num_ip_blocks; j++) {
2622                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2623                 if (!adev->ip_blocks[i].status.late_initialized)
2624                         continue;
2625                 /* skip PG for GFX, SDMA on S0ix */
2626                 if (adev->in_s0ix &&
2627                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2628                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2629                         continue;
2630                 /* skip PG for VCE/UVD, it's handled specially */
2631                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2632                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2633                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2634                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2635                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2636                         /* enable powergating to save power */
2637                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2638                                                                                         state);
2639                         if (r) {
2640                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2641                                           adev->ip_blocks[i].version->funcs->name, r);
2642                                 return r;
2643                         }
2644                 }
2645         }
2646         return 0;
2647 }
2648
2649 static int amdgpu_device_enable_mgpu_fan_boost(void)
2650 {
2651         struct amdgpu_gpu_instance *gpu_ins;
2652         struct amdgpu_device *adev;
2653         int i, ret = 0;
2654
2655         mutex_lock(&mgpu_info.mutex);
2656
2657         /*
2658          * MGPU fan boost feature should be enabled
2659          * only when there are two or more dGPUs in
2660          * the system
2661          */
2662         if (mgpu_info.num_dgpu < 2)
2663                 goto out;
2664
2665         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2666                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2667                 adev = gpu_ins->adev;
2668                 if (!(adev->flags & AMD_IS_APU) &&
2669                     !gpu_ins->mgpu_fan_enabled) {
2670                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2671                         if (ret)
2672                                 break;
2673
2674                         gpu_ins->mgpu_fan_enabled = 1;
2675                 }
2676         }
2677
2678 out:
2679         mutex_unlock(&mgpu_info.mutex);
2680
2681         return ret;
2682 }
2683
2684 /**
2685  * amdgpu_device_ip_late_init - run late init for hardware IPs
2686  *
2687  * @adev: amdgpu_device pointer
2688  *
2689  * Late initialization pass for hardware IPs.  The list of all the hardware
2690  * IPs that make up the asic is walked and the late_init callbacks are run.
2691  * late_init covers any special initialization that an IP requires
2692  * after all of them have been initialized or something that needs to happen
2693  * late in the init process.
2694  * Returns 0 on success, negative error code on failure.
2695  */
2696 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2697 {
2698         struct amdgpu_gpu_instance *gpu_instance;
2699         int i = 0, r;
2700
2701         for (i = 0; i < adev->num_ip_blocks; i++) {
2702                 if (!adev->ip_blocks[i].status.hw)
2703                         continue;
2704                 if (adev->ip_blocks[i].version->funcs->late_init) {
2705                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2706                         if (r) {
2707                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2708                                           adev->ip_blocks[i].version->funcs->name, r);
2709                                 return r;
2710                         }
2711                 }
2712                 adev->ip_blocks[i].status.late_initialized = true;
2713         }
2714
2715         r = amdgpu_ras_late_init(adev);
2716         if (r) {
2717                 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2718                 return r;
2719         }
2720
2721         amdgpu_ras_set_error_query_ready(adev, true);
2722
2723         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2724         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2725
2726         amdgpu_device_fill_reset_magic(adev);
2727
2728         r = amdgpu_device_enable_mgpu_fan_boost();
2729         if (r)
2730                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2731
2732         /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
2733         if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2734                                adev->asic_type == CHIP_ALDEBARAN))
2735                 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2736
2737         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2738                 mutex_lock(&mgpu_info.mutex);
2739
2740                 /*
2741                  * Reset the device p-state to low, as it was booted with high.
2742                  *
2743                  * This should be performed only after all devices from the same
2744                  * hive have been initialized.
2745                  *
2746                  * However, the number of devices in the hive is not known in
2747                  * advance, as it is counted one by one during device initialization.
2748                  *
2749                  * So, we wait for all XGMI interlinked devices to be initialized.
2750                  * This may bring some delay, as those devices may come from
2751                  * different hives. But that should be OK.
2752                  */
2753                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2754                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2755                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2756                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2757                                         continue;
2758
2759                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2760                                                 AMDGPU_XGMI_PSTATE_MIN);
2761                                 if (r) {
2762                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2763                                         break;
2764                                 }
2765                         }
2766                 }
2767
2768                 mutex_unlock(&mgpu_info.mutex);
2769         }
2770
2771         return 0;
2772 }
2773
2774 /**
2775  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2776  *
2777  * @adev: amdgpu_device pointer
2778  *
2779  * For ASICs that need to disable SMC first
2780  */
2781 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2782 {
2783         int i, r;
2784
2785         if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2786                 return;
2787
2788         for (i = 0; i < adev->num_ip_blocks; i++) {
2789                 if (!adev->ip_blocks[i].status.hw)
2790                         continue;
2791                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2792                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2793                         /* XXX handle errors */
2794                         if (r) {
2795                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2796                                           adev->ip_blocks[i].version->funcs->name, r);
2797                         }
2798                         adev->ip_blocks[i].status.hw = false;
2799                         break;
2800                 }
2801         }
2802 }
2803
2804 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2805 {
2806         int i, r;
2807
2808         for (i = 0; i < adev->num_ip_blocks; i++) {
2809                 if (!adev->ip_blocks[i].version->funcs->early_fini)
2810                         continue;
2811
2812                 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2813                 if (r) {
2814                         DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2815                                   adev->ip_blocks[i].version->funcs->name, r);
2816                 }
2817         }
2818
2819         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2820         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2821
2822         amdgpu_amdkfd_suspend(adev, false);
2823
2824         /* Workaround for ASICs that need to disable SMC first */
2825         amdgpu_device_smu_fini_early(adev);
2826
2827         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2828                 if (!adev->ip_blocks[i].status.hw)
2829                         continue;
2830
2831                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2832                 /* XXX handle errors */
2833                 if (r) {
2834                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2835                                   adev->ip_blocks[i].version->funcs->name, r);
2836                 }
2837
2838                 adev->ip_blocks[i].status.hw = false;
2839         }
2840
2841         if (amdgpu_sriov_vf(adev)) {
2842                 if (amdgpu_virt_release_full_gpu(adev, false))
2843                         DRM_ERROR("failed to release exclusive mode on fini\n");
2844         }
2845
2846         return 0;
2847 }
2848
2849 /**
2850  * amdgpu_device_ip_fini - run fini for hardware IPs
2851  *
2852  * @adev: amdgpu_device pointer
2853  *
2854  * Main teardown pass for hardware IPs.  The list of all the hardware
2855  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2856  * are run.  hw_fini tears down the hardware associated with each IP
2857  * and sw_fini tears down any software state associated with each IP.
2858  * Returns 0 on success, negative error code on failure.
2859  */
2860 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2861 {
2862         int i, r;
2863
2864         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2865                 amdgpu_virt_release_ras_err_handler_data(adev);
2866
2867         if (adev->gmc.xgmi.num_physical_nodes > 1)
2868                 amdgpu_xgmi_remove_device(adev);
2869
2870         amdgpu_amdkfd_device_fini_sw(adev);
2871
2872         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2873                 if (!adev->ip_blocks[i].status.sw)
2874                         continue;
2875
2876                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2877                         amdgpu_ucode_free_bo(adev);
2878                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2879                         amdgpu_device_wb_fini(adev);
2880                         amdgpu_device_mem_scratch_fini(adev);
2881                         amdgpu_ib_pool_fini(adev);
2882                 }
2883
2884                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2885                 /* XXX handle errors */
2886                 if (r) {
2887                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2888                                   adev->ip_blocks[i].version->funcs->name, r);
2889                 }
2890                 adev->ip_blocks[i].status.sw = false;
2891                 adev->ip_blocks[i].status.valid = false;
2892         }
2893
2894         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2895                 if (!adev->ip_blocks[i].status.late_initialized)
2896                         continue;
2897                 if (adev->ip_blocks[i].version->funcs->late_fini)
2898                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2899                 adev->ip_blocks[i].status.late_initialized = false;
2900         }
2901
2902         amdgpu_ras_fini(adev);
2903
2904         return 0;
2905 }
2906
2907 /**
2908  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2909  *
2910  * @work: work_struct.
2911  */
2912 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2913 {
2914         struct amdgpu_device *adev =
2915                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2916         int r;
2917
2918         r = amdgpu_ib_ring_tests(adev);
2919         if (r)
2920                 DRM_ERROR("ib ring test failed (%d).\n", r);
2921 }
2922
2923 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2924 {
2925         struct amdgpu_device *adev =
2926                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2927
2928         WARN_ON_ONCE(adev->gfx.gfx_off_state);
2929         WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2930
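        /* A zero return means the SMU accepted the powergating request,
         * so record that GFXOFF is now in effect.
         */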
2931         if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2932                 adev->gfx.gfx_off_state = true;
2933 }
2934
2935 /**
2936  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2937  *
2938  * @adev: amdgpu_device pointer
2939  *
2940  * Main suspend function for hardware IPs.  The list of all the hardware
2941  * IPs that make up the asic is walked, clockgating is disabled and the
2942  * suspend callbacks are run.  suspend puts the hardware and software state
2943  * in each IP into a state suitable for suspend.
2944  * Returns 0 on success, negative error code on failure.
2945  */
2946 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2947 {
2948         int i, r;
2949
2950         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2951         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2952
2953         /*
2954          * Per the PMFW team's suggestion, the driver needs to disable
2955          * the gfxoff and df cstate features for gpu reset scenarios
2956          * (e.g. Mode1Reset). Add the missing df cstate disablement here.
2957          */
2958         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2959                 dev_warn(adev->dev, "Failed to disallow df cstate");
2960
2961         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2962                 if (!adev->ip_blocks[i].status.valid)
2963                         continue;
2964
2965                 /* displays are handled separately */
2966                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2967                         continue;
2968
2970                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2971                 /* XXX handle errors */
2972                 if (r) {
2973                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2974                                   adev->ip_blocks[i].version->funcs->name, r);
2975                         return r;
2976                 }
2977
2978                 adev->ip_blocks[i].status.hw = false;
2979         }
2980
2981         return 0;
2982 }
2983
2984 /**
2985  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2986  *
2987  * @adev: amdgpu_device pointer
2988  *
2989  * Main suspend function for hardware IPs.  The list of all the hardware
2990  * IPs that make up the asic is walked, clockgating is disabled and the
2991  * suspend callbacks are run.  suspend puts the hardware and software state
2992  * in each IP into a state suitable for suspend.
2993  * Returns 0 on success, negative error code on failure.
2994  */
2995 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2996 {
2997         int i, r;
2998
2999         if (adev->in_s0ix)
3000                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3001
3002         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3003                 if (!adev->ip_blocks[i].status.valid)
3004                         continue;
3005                 /* displays are handled in phase1 */
3006                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3007                         continue;
3008                 /* PSP lost connection when err_event_athub occurs */
3009                 if (amdgpu_ras_intr_triggered() &&
3010                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3011                         adev->ip_blocks[i].status.hw = false;
3012                         continue;
3013                 }
3014
3015                 /* skip unnecessary suspend if we have not initialized them yet */
3016                 if (adev->gmc.xgmi.pending_reset &&
3017                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3018                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3019                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3020                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3021                         adev->ip_blocks[i].status.hw = false;
3022                         continue;
3023                 }
3024
3025                 /* skip suspend of gfx/mes and psp for S0ix
3026                  * gfx is in gfxoff state, so on resume it will exit gfxoff just
3027                  * like at runtime. PSP is also part of the always on hardware
3028                  * so no need to suspend it.
3029                  */
3030                 if (adev->in_s0ix &&
3031                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3032                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3033                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3034                         continue;
3035
3036                 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3037                 if (adev->in_s0ix &&
3038                     (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3039                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3040                         continue;
3041
3042                 /* During cold boot, swPSP provides the IMU and RLC FW binaries
3043                  * to TOS. These live in the TMR and are expected to be reused by
3044                  * PSP-TOS to reload from that location; RLC autoload is also
3045                  * triggered from there based on the PMFW -> PSP message during
3046                  * the re-init sequence. Therefore, PSP suspend & resume should
3047                  * be skipped on IMU enabled APU ASICs to avoid destroying the
3048                  * TMR and reloading the FWs again.
3049                 if (amdgpu_in_reset(adev) &&
3050                     (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3051                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3052                         continue;
3053
3055                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3056                 /* XXX handle errors */
3057                 if (r) {
3058                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
3059                                   adev->ip_blocks[i].version->funcs->name, r);
3060                 }
3061                 adev->ip_blocks[i].status.hw = false;
3062                 /* handle putting the SMC in the appropriate state */
3063                 if (!amdgpu_sriov_vf(adev)) {
3064                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3065                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3066                                 if (r) {
3067                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3068                                                         adev->mp1_state, r);
3069                                         return r;
3070                                 }
3071                         }
3072                 }
3073         }
3074
3075         return 0;
3076 }
3077
3078 /**
3079  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3080  *
3081  * @adev: amdgpu_device pointer
3082  *
3083  * Main suspend function for hardware IPs.  The list of all the hardware
3084  * IPs that make up the asic is walked, clockgating is disabled and the
3085  * suspend callbacks are run.  suspend puts the hardware and software state
3086  * in each IP into a state suitable for suspend.
3087  * Returns 0 on success, negative error code on failure.
3088  */
3089 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3090 {
3091         int r;
3092
3093         if (amdgpu_sriov_vf(adev)) {
3094                 amdgpu_virt_fini_data_exchange(adev);
3095                 amdgpu_virt_request_full_gpu(adev, false);
3096         }
3097
3098         r = amdgpu_device_ip_suspend_phase1(adev);
3099         if (r)
3100                 return r;
3101         r = amdgpu_device_ip_suspend_phase2(adev);
3102
3103         if (amdgpu_sriov_vf(adev))
3104                 amdgpu_virt_release_full_gpu(adev, false);
3105
3106         return r;
3107 }
3108
3109 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3110 {
3111         int i, r;
3112
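        /* After an SR-IOV reset, bring the fundamental blocks back first and
         * in this fixed order, so that register access (COMMON), memory
         * (GMC), firmware loading (PSP) and interrupts (IH) work before the
         * remaining IPs are re-initialized in
         * amdgpu_device_ip_reinit_late_sriov().
         */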
3113         static enum amd_ip_block_type ip_order[] = {
3114                 AMD_IP_BLOCK_TYPE_COMMON,
3115                 AMD_IP_BLOCK_TYPE_GMC,
3116                 AMD_IP_BLOCK_TYPE_PSP,
3117                 AMD_IP_BLOCK_TYPE_IH,
3118         };
3119
3120         for (i = 0; i < adev->num_ip_blocks; i++) {
3121                 int j;
3122                 struct amdgpu_ip_block *block;
3123
3124                 block = &adev->ip_blocks[i];
3125                 block->status.hw = false;
3126
3127                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3128
3129                         if (block->version->type != ip_order[j] ||
3130                                 !block->status.valid)
3131                                 continue;
3132
3133                         r = block->version->funcs->hw_init(adev);
3134                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3135                         if (r)
3136                                 return r;
3137                         block->status.hw = true;
3138                 }
3139         }
3140
3141         return 0;
3142 }
3143
3144 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3145 {
3146         int i, r;
3147
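        /* Bring the remaining IPs back in dependency order.  Note that the
         * SMC block goes through resume rather than hw_init below.
         */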
3148         static enum amd_ip_block_type ip_order[] = {
3149                 AMD_IP_BLOCK_TYPE_SMC,
3150                 AMD_IP_BLOCK_TYPE_DCE,
3151                 AMD_IP_BLOCK_TYPE_GFX,
3152                 AMD_IP_BLOCK_TYPE_SDMA,
3153                 AMD_IP_BLOCK_TYPE_UVD,
3154                 AMD_IP_BLOCK_TYPE_VCE,
3155                 AMD_IP_BLOCK_TYPE_VCN
3156         };
3157
3158         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3159                 int j;
3160                 struct amdgpu_ip_block *block;
3161
3162                 for (j = 0; j < adev->num_ip_blocks; j++) {
3163                         block = &adev->ip_blocks[j];
3164
3165                         if (block->version->type != ip_order[i] ||
3166                                 !block->status.valid ||
3167                                 block->status.hw)
3168                                 continue;
3169
3170                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3171                                 r = block->version->funcs->resume(adev);
3172                         else
3173                                 r = block->version->funcs->hw_init(adev);
3174
3175                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3176                         if (r)
3177                                 return r;
3178                         block->status.hw = true;
3179                 }
3180         }
3181
3182         return 0;
3183 }
3184
3185 /**
3186  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3187  *
3188  * @adev: amdgpu_device pointer
3189  *
3190  * First resume function for hardware IPs.  The list of all the hardware
3191  * IPs that make up the asic is walked and the resume callbacks are run for
3192  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3193  * after a suspend and updates the software state as necessary.  This
3194  * function is also used for restoring the GPU after a GPU reset.
3195  * Returns 0 on success, negative error code on failure.
3196  */
3197 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3198 {
3199         int i, r;
3200
3201         for (i = 0; i < adev->num_ip_blocks; i++) {
3202                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3203                         continue;
3204                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3205                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3206                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3207                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3208
3209                         r = adev->ip_blocks[i].version->funcs->resume(adev);
3210                         if (r) {
3211                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
3212                                           adev->ip_blocks[i].version->funcs->name, r);
3213                                 return r;
3214                         }
3215                         adev->ip_blocks[i].status.hw = true;
3216                 }
3217         }
3218
3219         return 0;
3220 }
3221
3222 /**
3223  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3224  *
3225  * @adev: amdgpu_device pointer
3226  *
3227  * Second resume function for hardware IPs.  The list of all the hardware
3228  * IPs that make up the asic is walked and the resume callbacks are run for
3229  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3230  * functional state after a suspend and updates the software state as
3231  * necessary.  This function is also used for restoring the GPU after a GPU
3232  * reset.
3233  * Returns 0 on success, negative error code on failure.
3234  */
3235 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3236 {
3237         int i, r;
3238
3239         for (i = 0; i < adev->num_ip_blocks; i++) {
3240                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3241                         continue;
3242                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3243                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3244                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3245                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3246                         continue;
3247                 r = adev->ip_blocks[i].version->funcs->resume(adev);
3248                 if (r) {
3249                         DRM_ERROR("resume of IP block <%s> failed %d\n",
3250                                   adev->ip_blocks[i].version->funcs->name, r);
3251                         return r;
3252                 }
3253                 adev->ip_blocks[i].status.hw = true;
3254         }
3255
3256         return 0;
3257 }
3258
3259 /**
3260  * amdgpu_device_ip_resume - run resume for hardware IPs
3261  *
3262  * @adev: amdgpu_device pointer
3263  *
3264  * Main resume function for hardware IPs.  The hardware IPs
3265  * are split into two resume functions because they are
3266  * also used in recovering from a GPU reset and some additional
3267  * steps need to be taken between them.  In this case (S3/S4) they are
3268  * run sequentially.
3269  * Returns 0 on success, negative error code on failure.
3270  */
3271 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3272 {
3273         int r;
3274
3275         r = amdgpu_amdkfd_resume_iommu(adev);
3276         if (r)
3277                 return r;
3278
3279         r = amdgpu_device_ip_resume_phase1(adev);
3280         if (r)
3281                 return r;
3282
3283         r = amdgpu_device_fw_loading(adev);
3284         if (r)
3285                 return r;
3286
3287         r = amdgpu_device_ip_resume_phase2(adev);
3288
3289         return r;
3290 }
3291
3292 /**
3293  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3294  *
3295  * @adev: amdgpu_device pointer
3296  *
3297  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3298  */
3299 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3300 {
3301         if (amdgpu_sriov_vf(adev)) {
3302                 if (adev->is_atom_fw) {
3303                         if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3304                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3305                 } else {
3306                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3307                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3308                 }
3309
3310                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3311                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3312         }
3313 }
3314
3315 /**
3316  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3317  *
3318  * @asic_type: AMD asic type
3319  *
3320  * Check if there is DC (new modesetting infrastructure) support for an asic.
3321  * returns true if DC has support, false if not.
3322  */
3323 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3324 {
3325         switch (asic_type) {
3326 #ifdef CONFIG_DRM_AMDGPU_SI
3327         case CHIP_HAINAN:
3328 #endif
3329         case CHIP_TOPAZ:
3330                 /* chips with no display hardware */
3331                 return false;
3332 #if defined(CONFIG_DRM_AMD_DC)
3333         case CHIP_TAHITI:
3334         case CHIP_PITCAIRN:
3335         case CHIP_VERDE:
3336         case CHIP_OLAND:
3337                 /*
3338                  * We have systems in the wild with these ASICs that require
3339                  * LVDS and VGA support which is not supported with DC.
3340                  *
3341                  * Fall back to the non-DC driver here by default so as not to
3342                  * cause regressions.
3343                  */
3344 #if defined(CONFIG_DRM_AMD_DC_SI)
3345                 return amdgpu_dc > 0;
3346 #else
3347                 return false;
3348 #endif
3349         case CHIP_BONAIRE:
3350         case CHIP_KAVERI:
3351         case CHIP_KABINI:
3352         case CHIP_MULLINS:
3353                 /*
3354                  * We have systems in the wild with these ASICs that require
3355                  * VGA support which is not supported with DC.
3356                  *
3357                  * Fall back to the non-DC driver here by default so as not to
3358                  * cause regressions.
3359                  */
3360                 return amdgpu_dc > 0;
3361         default:
3362                 return amdgpu_dc != 0;
3363 #else
3364         default:
3365                 if (amdgpu_dc > 0)
3366                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3367                                          "but isn't supported by ASIC, ignoring\n");
3368                 return false;
3369 #endif
3370         }
3371 }
3372
3373 /**
3374  * amdgpu_device_has_dc_support - check if dc is supported
3375  *
3376  * @adev: amdgpu_device pointer
3377  *
3378  * Returns true for supported, false for not supported
3379  */
3380 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3381 {
3382         if (adev->enable_virtual_display ||
3383             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3384                 return false;
3385
3386         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3387 }
3388
3389 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3390 {
3391         struct amdgpu_device *adev =
3392                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3393         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3394
3395         /* It's a bug to not have a hive within this function */
3396         if (WARN_ON(!hive))
3397                 return;
3398
3399         /*
3400          * Use task barrier to synchronize all xgmi reset works across the
3401          * hive. task_barrier_enter and task_barrier_exit will block
3402          * until all the threads running the xgmi reset works reach
3403          * those points. task_barrier_full will do both blocks.
3404          */
3405         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3406
3407                 task_barrier_enter(&hive->tb);
3408                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3409
3410                 if (adev->asic_reset_res)
3411                         goto fail;
3412
3413                 task_barrier_exit(&hive->tb);
3414                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3415
3416                 if (adev->asic_reset_res)
3417                         goto fail;
3418
3419                 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3420                     adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3421                         adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3422         } else {
3423
3424                 task_barrier_full(&hive->tb);
3425                 adev->asic_reset_res = amdgpu_asic_reset(adev);
3426         }
3427
3428 fail:
3429         if (adev->asic_reset_res)
3430                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3431                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3432         amdgpu_put_xgmi_hive(hive);
3433 }
3434
3435 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3436 {
3437         char *input = amdgpu_lockup_timeout;
3438         char *timeout_setting = NULL;
3439         int index = 0;
3440         long timeout;
3441         int ret = 0;
3442
3443         /*
3444          * By default the timeout for non-compute jobs is 10000 ms
3445          * and 60000 ms for compute jobs.
3446          * Under SR-IOV the compute timeout is 60000 ms in pp_one_vf
3447          * mode and 10000 ms otherwise.
3448          */
3449         adev->gfx_timeout = msecs_to_jiffies(10000);
3450         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3451         if (amdgpu_sriov_vf(adev))
3452                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3453                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3454         else
3455                 adev->compute_timeout = msecs_to_jiffies(60000);
3456
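        /*
         * lockup_timeout is a comma-separated list applied in the order
         * gfx, compute, sdma, video; e.g. (illustrative values)
         * "10000,60000,10000,10000".  A single value applies to all
         * non-compute queues (and, for SR-IOV/passthrough, compute too).
         */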
3457         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3458                 while ((timeout_setting = strsep(&input, ",")) &&
3459                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3460                         ret = kstrtol(timeout_setting, 0, &timeout);
3461                         if (ret)
3462                                 return ret;
3463
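                        /* A value of 0 keeps the default for that queue
                         * type; a negative value disables the timeout.
                         */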
3464                         if (timeout == 0) {
3465                                 index++;
3466                                 continue;
3467                         } else if (timeout < 0) {
3468                                 timeout = MAX_SCHEDULE_TIMEOUT;
3469                                 dev_warn(adev->dev, "lockup timeout disabled");
3470                                 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3471                         } else {
3472                                 timeout = msecs_to_jiffies(timeout);
3473                         }
3474
3475                         switch (index++) {
3476                         case 0:
3477                                 adev->gfx_timeout = timeout;
3478                                 break;
3479                         case 1:
3480                                 adev->compute_timeout = timeout;
3481                                 break;
3482                         case 2:
3483                                 adev->sdma_timeout = timeout;
3484                                 break;
3485                         case 3:
3486                                 adev->video_timeout = timeout;
3487                                 break;
3488                         default:
3489                                 break;
3490                         }
3491                 }
3492                 /*
3493                  * There is only one value specified and
3494                  * it should apply to all non-compute jobs.
3495                  */
3496                 if (index == 1) {
3497                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3498                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3499                                 adev->compute_timeout = adev->gfx_timeout;
3500                 }
3501         }
3502
3503         return ret;
3504 }
3505
3506 /**
3507  * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
3508  *
3509  * @adev: amdgpu_device pointer
3510  *
3511  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3512  */
3513 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3514 {
3515         struct iommu_domain *domain;
3516
3517         domain = iommu_get_domain_for_dev(adev->dev);
3518         if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3519                 adev->ram_is_direct_mapped = true;
3520 }
3521
3522 static const struct attribute *amdgpu_dev_attributes[] = {
3523         &dev_attr_product_name.attr,
3524         &dev_attr_product_number.attr,
3525         &dev_attr_serial_number.attr,
3526         &dev_attr_pcie_replay_count.attr,
3527         NULL
3528 };
3529
3530 /**
3531  * amdgpu_device_init - initialize the driver
3532  *
3533  * @adev: amdgpu_device pointer
3534  * @flags: driver flags
3535  *
3536  * Initializes the driver info and hw (all asics).
3537  * Returns 0 for success or an error on failure.
3538  * Called at driver startup.
3539  */
3540 int amdgpu_device_init(struct amdgpu_device *adev,
3541                        uint32_t flags)
3542 {
3543         struct drm_device *ddev = adev_to_drm(adev);
3544         struct pci_dev *pdev = adev->pdev;
3545         int r, i;
3546         bool px = false;
3547         u32 max_MBps;
3548
3549         adev->shutdown = false;
3550         adev->flags = flags;
3551
3552         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3553                 adev->asic_type = amdgpu_force_asic_type;
3554         else
3555                 adev->asic_type = flags & AMD_ASIC_MASK;
3556
3557         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3558         if (amdgpu_emu_mode == 1)
3559                 adev->usec_timeout *= 10;
3560         adev->gmc.gart_size = 512 * 1024 * 1024;
3561         adev->accel_working = false;
3562         adev->num_rings = 0;
3563         RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3564         adev->mman.buffer_funcs = NULL;
3565         adev->mman.buffer_funcs_ring = NULL;
3566         adev->vm_manager.vm_pte_funcs = NULL;
3567         adev->vm_manager.vm_pte_num_scheds = 0;
3568         adev->gmc.gmc_funcs = NULL;
3569         adev->harvest_ip_mask = 0x0;
3570         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3571         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3572
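        /* Point every register accessor at a stub that flags invalid
         * access; the real callbacks are installed later during IP setup.
         */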
3573         adev->smc_rreg = &amdgpu_invalid_rreg;
3574         adev->smc_wreg = &amdgpu_invalid_wreg;
3575         adev->pcie_rreg = &amdgpu_invalid_rreg;
3576         adev->pcie_wreg = &amdgpu_invalid_wreg;
3577         adev->pciep_rreg = &amdgpu_invalid_rreg;
3578         adev->pciep_wreg = &amdgpu_invalid_wreg;
3579         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3580         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3581         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3582         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3583         adev->didt_rreg = &amdgpu_invalid_rreg;
3584         adev->didt_wreg = &amdgpu_invalid_wreg;
3585         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3586         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3587         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3588         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3589
3590         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3591                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3592                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3593
3594         /* mutex initialization is all done here so we
3595          * can recall functions without having locking issues */
3596         mutex_init(&adev->firmware.mutex);
3597         mutex_init(&adev->pm.mutex);
3598         mutex_init(&adev->gfx.gpu_clock_mutex);
3599         mutex_init(&adev->srbm_mutex);
3600         mutex_init(&adev->gfx.pipe_reserve_mutex);
3601         mutex_init(&adev->gfx.gfx_off_mutex);
3602         mutex_init(&adev->grbm_idx_mutex);
3603         mutex_init(&adev->mn_lock);
3604         mutex_init(&adev->virt.vf_errors.lock);
3605         hash_init(adev->mn_hash);
3606         mutex_init(&adev->psp.mutex);
3607         mutex_init(&adev->notifier_lock);
3608         mutex_init(&adev->pm.stable_pstate_ctx_lock);
3609         mutex_init(&adev->benchmark_mutex);
3610
3611         amdgpu_device_init_apu_flags(adev);
3612
3613         r = amdgpu_device_check_arguments(adev);
3614         if (r)
3615                 return r;
3616
3617         spin_lock_init(&adev->mmio_idx_lock);
3618         spin_lock_init(&adev->smc_idx_lock);
3619         spin_lock_init(&adev->pcie_idx_lock);
3620         spin_lock_init(&adev->uvd_ctx_idx_lock);
3621         spin_lock_init(&adev->didt_idx_lock);
3622         spin_lock_init(&adev->gc_cac_idx_lock);
3623         spin_lock_init(&adev->se_cac_idx_lock);
3624         spin_lock_init(&adev->audio_endpt_idx_lock);
3625         spin_lock_init(&adev->mm_stats.lock);
3626
3627         INIT_LIST_HEAD(&adev->shadow_list);
3628         mutex_init(&adev->shadow_list_lock);
3629
3630         INIT_LIST_HEAD(&adev->reset_list);
3631
3632         INIT_LIST_HEAD(&adev->ras_list);
3633
3634         INIT_DELAYED_WORK(&adev->delayed_init_work,
3635                           amdgpu_device_delayed_init_work_handler);
3636         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3637                           amdgpu_device_delay_enable_gfx_off);
3638
3639         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3640
3641         adev->gfx.gfx_off_req_count = 1;
3642         adev->gfx.gfx_off_residency = 0;
3643         adev->gfx.gfx_off_entrycount = 0;
3644         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3645
3646         atomic_set(&adev->throttling_logging_enabled, 1);
3647         /*
3648          * If throttling continues, logging will be performed every minute
3649          * to avoid log flooding. "-1" is subtracted since the thermal
3650          * throttling interrupt comes every second. Thus, the total logging
3651          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3652          * for throttling interrupt) = 60 seconds.
3653          */
3654         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3655         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3656
3657         /* Registers mapping */
3658         /* TODO: block userspace mapping of io register */
3659         if (adev->asic_type >= CHIP_BONAIRE) {
3660                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3661                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3662         } else {
3663                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3664                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3665         }
3666
3667         for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3668                 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3669
3670         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3671         if (!adev->rmmio)
3672                 return -ENOMEM;
3674         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3675         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3676
3677         amdgpu_device_get_pcie_info(adev);
3678
3679         if (amdgpu_mcbp)
3680                 DRM_INFO("MCBP is enabled\n");
3681
3682         /*
3683          * Reset domain needs to be present early, before the XGMI hive is
3684          * discovered (if any) and initialized, to use the reset sem and
3685          * in_gpu reset flag early on during init and before calling RREG32.
3686          */
3687         adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3688         if (!adev->reset_domain)
3689                 return -ENOMEM;
3690
3691         /* detect hw virtualization here */
3692         amdgpu_detect_virtualization(adev);
3693
3694         r = amdgpu_device_get_job_timeout_settings(adev);
3695         if (r) {
3696                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3697                 return r;
3698         }
3699
3700         /* early init functions */
3701         r = amdgpu_device_ip_early_init(adev);
3702         if (r)
3703                 return r;
3704
3705         /* Get rid of things like offb */
3706         r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3707         if (r)
3708                 return r;
3709
3710         /* Enable TMZ based on IP_VERSION */
3711         amdgpu_gmc_tmz_set(adev);
3712
3713         amdgpu_gmc_noretry_set(adev);
3714         /* Need to get xgmi info early to decide the reset behavior */
3715         if (adev->gmc.xgmi.supported) {
3716                 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3717                 if (r)
3718                         return r;
3719         }
3720
3721         /* enable PCIE atomic ops */
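        /* Under SR-IOV the capability comes from the PF via the pf2vf data
         * rather than from probing, since the VF presumably cannot
         * configure the PCIe root port itself.
         */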
3722         if (amdgpu_sriov_vf(adev))
3723                 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3724                         adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3725                         (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3726         else
3727                 adev->have_atomics_support =
3728                         !pci_enable_atomic_ops_to_root(adev->pdev,
3729                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3730                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3731         if (!adev->have_atomics_support)
3732                 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3733
3734         /* doorbell bar mapping and doorbell index init */
3735         amdgpu_device_doorbell_init(adev);
3736
3737         if (amdgpu_emu_mode == 1) {
3738                 /* post the asic on emulation mode */
3739                 emu_soc_asic_init(adev);
3740                 goto fence_driver_init;
3741         }
3742
3743         amdgpu_reset_init(adev);
3744
3745         /* detect if we have an SR-IOV vBIOS */
3746         amdgpu_device_detect_sriov_bios(adev);
3747
3748         /* check if we need to reset the asic
3749          *  E.g., driver was not cleanly unloaded previously, etc.
3750          */
3751         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3752                 if (adev->gmc.xgmi.num_physical_nodes) {
3753                         dev_info(adev->dev, "Pending hive reset.\n");
3754                         adev->gmc.xgmi.pending_reset = true;
3755                         /* Only need to init necessary block for SMU to handle the reset */
3756                         for (i = 0; i < adev->num_ip_blocks; i++) {
3757                                 if (!adev->ip_blocks[i].status.valid)
3758                                         continue;
3759                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3760                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3761                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3762                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3763                                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3764                                                 adev->ip_blocks[i].version->funcs->name);
3765                                         adev->ip_blocks[i].status.hw = true;
3766                                 }
3767                         }
3768                 } else {
3769                         r = amdgpu_asic_reset(adev);
3770                         if (r) {
3771                                 dev_err(adev->dev, "asic reset on init failed\n");
3772                                 goto failed;
3773                         }
3774                 }
3775         }
3776
3777         /* Post card if necessary */
3778         if (amdgpu_device_need_post(adev)) {
3779                 if (!adev->bios) {
3780                         dev_err(adev->dev, "no vBIOS found\n");
3781                         r = -EINVAL;
3782                         goto failed;
3783                 }
3784                 DRM_INFO("GPU posting now...\n");
3785                 r = amdgpu_device_asic_init(adev);
3786                 if (r) {
3787                         dev_err(adev->dev, "gpu post error!\n");
3788                         goto failed;
3789                 }
3790         }
3791
3792         if (adev->is_atom_fw) {
3793                 /* Initialize clocks */
3794                 r = amdgpu_atomfirmware_get_clock_info(adev);
3795                 if (r) {
3796                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3797                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3798                         goto failed;
3799                 }
3800         } else {
3801                 /* Initialize clocks */
3802                 r = amdgpu_atombios_get_clock_info(adev);
3803                 if (r) {
3804                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3805                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3806                         goto failed;
3807                 }
3808                 /* init i2c buses */
3809                 if (!amdgpu_device_has_dc_support(adev))
3810                         amdgpu_atombios_i2c_init(adev);
3811         }
3812
3813 fence_driver_init:
3814         /* Fence driver */
3815         r = amdgpu_fence_driver_sw_init(adev);
3816         if (r) {
3817                 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3818                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3819                 goto failed;
3820         }
3821
3822         /* init the mode config */
3823         drm_mode_config_init(adev_to_drm(adev));
3824
3825         r = amdgpu_device_ip_init(adev);
3826         if (r) {
3827                 /* failed in exclusive mode due to timeout */
3828                 if (amdgpu_sriov_vf(adev) &&
3829                     !amdgpu_sriov_runtime(adev) &&
3830                     amdgpu_virt_mmio_blocked(adev) &&
3831                     !amdgpu_virt_wait_reset(adev)) {
3832                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3833                         /* Don't send request since VF is inactive. */
3834                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3835                         adev->virt.ops = NULL;
3836                         r = -EAGAIN;
3837                         goto release_ras_con;
3838                 }
3839                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3840                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3841                 goto release_ras_con;
3842         }
3843
3844         amdgpu_fence_driver_hw_init(adev);
3845
3846         dev_info(adev->dev,
3847                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3848                         adev->gfx.config.max_shader_engines,
3849                         adev->gfx.config.max_sh_per_se,
3850                         adev->gfx.config.max_cu_per_sh,
3851                         adev->gfx.cu_info.number);
3852
3853         adev->accel_working = true;
3854
3855         amdgpu_vm_check_compute_bug(adev);
3856
3857         /* Initialize the buffer migration limit. */
3858         if (amdgpu_moverate >= 0)
3859                 max_MBps = amdgpu_moverate;
3860         else
3861                 max_MBps = 8; /* Allow 8 MB/s. */
3862         /* Get a log2 for easy divisions. */
3863         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
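        /* e.g. the default max_MBps = 8 gives log2_max_MBps = 3, so later
         * bandwidth checks can use cheap shifts instead of divides.
         */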
3864
3865         r = amdgpu_pm_sysfs_init(adev);
3866         if (r)
3867                 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3868
3869         r = amdgpu_ucode_sysfs_init(adev);
3870         if (r) {
3871                 adev->ucode_sysfs_en = false;
3872                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3873         } else
3874                 adev->ucode_sysfs_en = true;
3875
3876         r = amdgpu_psp_sysfs_init(adev);
3877         if (r) {
3878                 adev->psp_sysfs_en = false;
3879                 if (!amdgpu_sriov_vf(adev))
3880                         DRM_ERROR("Creating psp sysfs failed\n");
3881         } else
3882                 adev->psp_sysfs_en = true;
3883
3884         /*
3885          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3886          * Otherwise the mgpu fan boost feature will be skipped because the
3887          * gpu instance count would be too low.
3888          */
3889         amdgpu_register_gpu_instance(adev);
3890
3891         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3892          * explicit gating rather than handling it automatically.
3893          */
3894         if (!adev->gmc.xgmi.pending_reset) {
3895                 r = amdgpu_device_ip_late_init(adev);
3896                 if (r) {
3897                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3898                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3899                         goto release_ras_con;
3900                 }
3901                 /* must succeed. */
3902                 amdgpu_ras_resume(adev);
3903                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3904                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3905         }
3906
3907         if (amdgpu_sriov_vf(adev))
3908                 flush_delayed_work(&adev->delayed_init_work);
3909
3910         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3911         if (r)
3912                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3913
3914         if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3915                 r = amdgpu_pmu_init(adev);
3916                 if (r)
3917                         dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3918         }
3918
3919         /* Keep stored PCI config space at hand to restore after a sudden PCI error */
3920         if (amdgpu_device_cache_pci_state(adev->pdev))
3921                 pci_restore_state(pdev);
3922
3923         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources.
3924          * This will fail for cards that aren't VGA class devices; just
3925          * ignore it. */
3926         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3927                 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3928
3929         px = amdgpu_device_supports_px(ddev);
3930
3931         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
3932                                 apple_gmux_detect(NULL, NULL)))
3933                 vga_switcheroo_register_client(adev->pdev,
3934                                                &amdgpu_switcheroo_ops, px);
3935
3936         if (px)
3937                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3938
3939         if (adev->gmc.xgmi.pending_reset)
3940                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3941                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3942
3943         amdgpu_device_check_iommu_direct_map(adev);
3944
3945         return 0;
3946
3947 release_ras_con:
3948         amdgpu_release_ras_context(adev);
3949
3950 failed:
3951         amdgpu_vf_error_trans_all(adev);
3952
3953         return r;
3954 }
3955
3956 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3957 {
3958
3959         /* Clear all CPU mappings pointing to this device */
3960         unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3961
3962         /* Unmap all mapped bars - Doorbell, registers and VRAM */
3963         amdgpu_device_doorbell_fini(adev);
3964
3965         iounmap(adev->rmmio);
3966         adev->rmmio = NULL;
3967         if (adev->mman.aper_base_kaddr)
3968                 iounmap(adev->mman.aper_base_kaddr);
3969         adev->mman.aper_base_kaddr = NULL;
3970
3971         /* Memory manager related */
3972         if (!adev->gmc.xgmi.connected_to_cpu) {
3973                 arch_phys_wc_del(adev->gmc.vram_mtrr);
3974                 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3975         }
3976 }
3977
3978 /**
3979  * amdgpu_device_fini_hw - tear down the driver
3980  *
3981  * @adev: amdgpu_device pointer
3982  *
3983  * Tear down the driver info (all asics).
3984  * Called at driver shutdown.
3985  */
3986 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3987 {
3988         dev_info(adev->dev, "amdgpu: finishing device.\n");
3989         flush_delayed_work(&adev->delayed_init_work);
3990         adev->shutdown = true;
3991
3992         /* make sure IB tests have finished before entering exclusive mode
3993          * to avoid preemption on IB test
3994          */
3995         if (amdgpu_sriov_vf(adev)) {
3996                 amdgpu_virt_request_full_gpu(adev, false);
3997                 amdgpu_virt_fini_data_exchange(adev);
3998         }
3999
4000         /* disable all interrupts */
4001         amdgpu_irq_disable_all(adev);
4002         if (adev->mode_info.mode_config_initialized) {
4003                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4004                         drm_helper_force_disable_all(adev_to_drm(adev));
4005                 else
4006                         drm_atomic_helper_shutdown(adev_to_drm(adev));
4007         }
4008         amdgpu_fence_driver_hw_fini(adev);
4009
4010         if (adev->mman.initialized)
4011                 drain_workqueue(adev->mman.bdev.wq);
4012
4013         if (adev->pm.sysfs_initialized)
4014                 amdgpu_pm_sysfs_fini(adev);
4015         if (adev->ucode_sysfs_en)
4016                 amdgpu_ucode_sysfs_fini(adev);
4017         if (adev->psp_sysfs_en)
4018                 amdgpu_psp_sysfs_fini(adev);
4019         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4020
4021         /* disable ras feature must before hw fini */
4022         amdgpu_ras_pre_fini(adev);
4023
4024         amdgpu_device_ip_fini_early(adev);
4025
4026         amdgpu_irq_fini_hw(adev);
4027
4028         if (adev->mman.initialized)
4029                 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4030
4031         amdgpu_gart_dummy_page_fini(adev);
4032
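        /* Only unmap MMIO here when the device is already gone; on a normal
         * unload the mappings are still needed until amdgpu_device_fini_sw().
         */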
4033         if (drm_dev_is_unplugged(adev_to_drm(adev)))
4034                 amdgpu_device_unmap_mmio(adev);
4035
4036 }
4037
4038 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4039 {
4040         int idx;
4041         bool px;
4042
4043         amdgpu_fence_driver_sw_fini(adev);
4044         amdgpu_device_ip_fini(adev);
4045         amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4046         adev->accel_working = false;
4047         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4048
4049         amdgpu_reset_fini(adev);
4050
4051         /* free i2c buses */
4052         if (!amdgpu_device_has_dc_support(adev))
4053                 amdgpu_i2c_fini(adev);
4054
4055         if (amdgpu_emu_mode != 1)
4056                 amdgpu_atombios_fini(adev);
4057
4058         kfree(adev->bios);
4059         adev->bios = NULL;
4060
4061         px = amdgpu_device_supports_px(adev_to_drm(adev));
4062
4063         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4064                                 apple_gmux_detect(NULL, NULL)))
4065                 vga_switcheroo_unregister_client(adev->pdev);
4066
4067         if (px)
4068                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4069
4070         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4071                 vga_client_unregister(adev->pdev);
4072
4073         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4074
4075                 iounmap(adev->rmmio);
4076                 adev->rmmio = NULL;
4077                 amdgpu_device_doorbell_fini(adev);
4078                 drm_dev_exit(idx);
4079         }
4080
4081         if (IS_ENABLED(CONFIG_PERF_EVENTS))
4082                 amdgpu_pmu_fini(adev);
4083         if (adev->mman.discovery_bin)
4084                 amdgpu_discovery_fini(adev);
4085
4086         amdgpu_reset_put_reset_domain(adev->reset_domain);
4087         adev->reset_domain = NULL;
4088
4089         kfree(adev->pci_state);
4090
4091 }
4092
4093 /**
4094  * amdgpu_device_evict_resources - evict device resources
4095  * @adev: amdgpu device object
4096  *
4097  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4098  * of the vram memory type. Mainly used for evicting device resources
4099  * at suspend time.
4100  *
4101  */
4102 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4103 {
4104         int ret;
4105
4106         /* No need to evict vram on APUs for suspend to ram or s2idle */
4107         if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4108                 return 0;
4109
4110         ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4111         if (ret)
4112                 DRM_WARN("evicting device resources failed\n");
4113         return ret;
4114 }
4115
4116 /*
4117  * Suspend & resume.
4118  */
4119 /**
4120  * amdgpu_device_suspend - initiate device suspend
4121  *
4122  * @dev: drm dev pointer
4123  * @fbcon: notify the fbdev of suspend
4124  *
4125  * Puts the hw in the suspend state (all asics).
4126  * Returns 0 for success or an error on failure.
4127  * Called at driver suspend.
4128  */
4129 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4130 {
4131         struct amdgpu_device *adev = drm_to_adev(dev);
4132         int r = 0;
4133
4134         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4135                 return 0;
4136
4137         adev->in_suspend = true;
4138
4139         /* Evict the majority of BOs before grabbing the full access */
4140         r = amdgpu_device_evict_resources(adev);
4141         if (r)
4142                 return r;
4143
4144         if (amdgpu_sriov_vf(adev)) {
4145                 amdgpu_virt_fini_data_exchange(adev);
4146                 r = amdgpu_virt_request_full_gpu(adev, false);
4147                 if (r)
4148                         return r;
4149         }
4150
4151         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4152                 DRM_WARN("smart shift update failed\n");
4153
4154         drm_kms_helper_poll_disable(dev);
4155
4156         if (fbcon)
4157                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4158
4159         cancel_delayed_work_sync(&adev->delayed_init_work);
4160
4161         amdgpu_ras_suspend(adev);
4162
4163         amdgpu_device_ip_suspend_phase1(adev);
4164
4165         if (!adev->in_s0ix)
4166                 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4167
4168         r = amdgpu_device_evict_resources(adev);
4169         if (r)
4170                 return r;
4171
4172         amdgpu_fence_driver_hw_fini(adev);
4173
4174         amdgpu_device_ip_suspend_phase2(adev);
4175
4176         if (amdgpu_sriov_vf(adev))
4177                 amdgpu_virt_release_full_gpu(adev, false);
4178
4179         return 0;
4180 }
4181
4182 /**
4183  * amdgpu_device_resume - initiate device resume
4184  *
4185  * @dev: drm dev pointer
4186  * @fbcon: notify the fbdev of resume
4187  *
4188  * Bring the hw back to operating state (all asics).
4189  * Returns 0 for success or an error on failure.
4190  * Called at driver resume.
4191  */
4192 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4193 {
4194         struct amdgpu_device *adev = drm_to_adev(dev);
4195         int r = 0;
4196
4197         if (amdgpu_sriov_vf(adev)) {
4198                 r = amdgpu_virt_request_full_gpu(adev, true);
4199                 if (r)
4200                         return r;
4201         }
4202
4203         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4204                 return 0;
4205
4206         if (adev->in_s0ix)
4207                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4208
4209         /* post card */
4210         if (amdgpu_device_need_post(adev)) {
4211                 r = amdgpu_device_asic_init(adev);
4212                 if (r)
4213                         dev_err(adev->dev, "amdgpu asic init failed\n");
4214         }
4215
4216         r = amdgpu_device_ip_resume(adev);
4217
4218         if (r) {
4219                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4220                 goto exit;
4221         }
4222         amdgpu_fence_driver_hw_init(adev);
4223
4224         r = amdgpu_device_ip_late_init(adev);
4225         if (r)
4226                 goto exit;
4227
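        /* Kick off the deferred IB ring tests; see
         * amdgpu_device_delayed_init_work_handler() above.
         */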
4228         queue_delayed_work(system_wq, &adev->delayed_init_work,
4229                            msecs_to_jiffies(AMDGPU_RESUME_MS));
4230
4231         if (!adev->in_s0ix) {
4232                 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4233                 if (r)
4234                         goto exit;
4235         }
4236
4237 exit:
4238         if (amdgpu_sriov_vf(adev)) {
4239                 amdgpu_virt_init_data_exchange(adev);
4240                 amdgpu_virt_release_full_gpu(adev, true);
4241         }
4242
4243         if (r)
4244                 return r;
4245
4246         /* Make sure IB tests are flushed */
4247         flush_delayed_work(&adev->delayed_init_work);
4248
4249         if (fbcon)
4250                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4251
4252         drm_kms_helper_poll_enable(dev);
4253
4254         amdgpu_ras_resume(adev);
4255
4256         if (adev->mode_info.num_crtc) {
4257                 /*
4258                  * Most of the connector probing functions try to acquire runtime pm
4259                  * refs to ensure that the GPU is powered on when connector polling is
4260                  * performed. Since we're calling this from a runtime PM callback,
4261                  * trying to acquire rpm refs will cause us to deadlock.
4262                  *
4263                  * Since we're guaranteed to be holding the rpm lock, it's safe to
4264                  * temporarily disable the rpm helpers so this doesn't deadlock us.
4265                  */
4266 #ifdef CONFIG_PM
4267                 dev->dev->power.disable_depth++;
4268 #endif
4269                 if (!adev->dc_enabled)
4270                         drm_helper_hpd_irq_event(dev);
4271                 else
4272                         drm_kms_helper_hotplug_event(dev);
4273 #ifdef CONFIG_PM
4274                 dev->dev->power.disable_depth--;
4275 #endif
4276         }
4277         adev->in_suspend = false;
4278
4279         if (adev->enable_mes)
4280                 amdgpu_mes_self_test(adev);
4281
4282         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4283                 DRM_WARN("smart shift update failed\n");
4284
4285         return 0;
4286 }
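
/*
 * Illustrative sketch (not part of this file): the system PM callbacks in
 * amdgpu_drv.c funnel into amdgpu_device_resume().  A minimal, hypothetical
 * resume callback would look roughly like this, with fbcon = true so the
 * console is unblanked on a full system resume:
 *
 *	static int example_pmops_resume(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_resume(drm_dev, true);
 *	}
 */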
4287
4288 /**
4289  * amdgpu_device_ip_check_soft_reset - check if the asic is still hung
4290  *
4291  * @adev: amdgpu_device pointer
4292  *
4293  * The list of all the hardware IPs that make up the asic is walked and
4294  * the check_soft_reset callbacks are run.  check_soft_reset determines
4295  * if the asic is still hung or not.
4296  * Returns true if any of the IPs are still in a hung state, false if not.
4297  */
4298 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4299 {
4300         int i;
4301         bool asic_hang = false;
4302
4303         if (amdgpu_sriov_vf(adev))
4304                 return true;
4305
4306         if (amdgpu_asic_need_full_reset(adev))
4307                 return true;
4308
4309         for (i = 0; i < adev->num_ip_blocks; i++) {
4310                 if (!adev->ip_blocks[i].status.valid)
4311                         continue;
4312                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4313                         adev->ip_blocks[i].status.hang =
4314                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4315                 if (adev->ip_blocks[i].status.hang) {
4316                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4317                         asic_hang = true;
4318                 }
4319         }
4320         return asic_hang;
4321 }
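
/*
 * Illustrative sketch: an IP block opts into the soft reset flow by filling
 * in the check_soft_reset member of its amd_ip_funcs.  A hypothetical block
 * might latch its hang state like this (the register and mask names are
 * invented for the example):
 *
 *	static bool example_ip_check_soft_reset(void *handle)
 *	{
 *		struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *
 *		// hung if the (hypothetical) busy bit is still set
 *		return !!(RREG32(EXAMPLE_IP_STATUS) & EXAMPLE_IP_BUSY_MASK);
 *	}
 */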
4322
4323 /**
4324  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4325  *
4326  * @adev: amdgpu_device pointer
4327  *
4328  * The list of all the hardware IPs that make up the asic is walked and the
4329  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4330  * handles any IP specific hardware or software state changes that are
4331  * necessary for a soft reset to succeed.
4332  * Returns 0 on success, negative error code on failure.
4333  */
4334 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4335 {
4336         int i, r = 0;
4337
4338         for (i = 0; i < adev->num_ip_blocks; i++) {
4339                 if (!adev->ip_blocks[i].status.valid)
4340                         continue;
4341                 if (adev->ip_blocks[i].status.hang &&
4342                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4343                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4344                         if (r)
4345                                 return r;
4346                 }
4347         }
4348
4349         return 0;
4350 }
4351
4352 /**
4353  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4354  *
4355  * @adev: amdgpu_device pointer
4356  *
4357  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4358  * reset is necessary to recover.
4359  * Returns true if a full asic reset is required, false if not.
4360  */
4361 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4362 {
4363         int i;
4364
4365         if (amdgpu_asic_need_full_reset(adev))
4366                 return true;
4367
4368         for (i = 0; i < adev->num_ip_blocks; i++) {
4369                 if (!adev->ip_blocks[i].status.valid)
4370                         continue;
4371                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4372                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4373                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4374                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4375                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4376                         if (adev->ip_blocks[i].status.hang) {
4377                                 dev_info(adev->dev, "Some blocks need full reset!\n");
4378                                 return true;
4379                         }
4380                 }
4381         }
4382         return false;
4383 }
4384
4385 /**
4386  * amdgpu_device_ip_soft_reset - do a soft reset
4387  *
4388  * @adev: amdgpu_device pointer
4389  *
4390  * The list of all the hardware IPs that make up the asic is walked and the
4391  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4392  * IP specific hardware or software state changes that are necessary to soft
4393  * reset the IP.
4394  * Returns 0 on success, negative error code on failure.
4395  */
4396 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4397 {
4398         int i, r = 0;
4399
4400         for (i = 0; i < adev->num_ip_blocks; i++) {
4401                 if (!adev->ip_blocks[i].status.valid)
4402                         continue;
4403                 if (adev->ip_blocks[i].status.hang &&
4404                     adev->ip_blocks[i].version->funcs->soft_reset) {
4405                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4406                         if (r)
4407                                 return r;
4408                 }
4409         }
4410
4411         return 0;
4412 }
4413
4414 /**
4415  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4416  *
4417  * @adev: amdgpu_device pointer
4418  *
4419  * The list of all the hardware IPs that make up the asic is walked and the
4420  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4421  * handles any IP specific hardware or software state changes that are
4422  * necessary after the IP has been soft reset.
4423  * Returns 0 on success, negative error code on failure.
4424  */
4425 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4426 {
4427         int i, r = 0;
4428
4429         for (i = 0; i < adev->num_ip_blocks; i++) {
4430                 if (!adev->ip_blocks[i].status.valid)
4431                         continue;
4432                 if (adev->ip_blocks[i].status.hang &&
4433                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4434                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4435                 if (r)
4436                         return r;
4437         }
4438
4439         return 0;
4440 }
4441
4442 /**
4443  * amdgpu_device_recover_vram - Recover some VRAM contents
4444  *
4445  * @adev: amdgpu_device pointer
4446  *
4447  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4448  * restore things like GPUVM page tables after a GPU reset where
4449  * the contents of VRAM might be lost.
4450  *
4451  * Returns:
4452  * 0 on success, negative error code on failure.
4453  */
4454 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4455 {
4456         struct dma_fence *fence = NULL, *next = NULL;
4457         struct amdgpu_bo *shadow;
4458         struct amdgpu_bo_vm *vmbo;
4459         long r = 1, tmo;
4460
4461         if (amdgpu_sriov_runtime(adev))
4462                 tmo = msecs_to_jiffies(8000);
4463         else
4464                 tmo = msecs_to_jiffies(100);
4465
4466         dev_info(adev->dev, "recover vram bo from shadow start\n");
4467         mutex_lock(&adev->shadow_list_lock);
4468         list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4469                 shadow = &vmbo->bo;
4470                 /* No need to recover an evicted BO */
4471                 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4472                     shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4473                     shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4474                         continue;
4475
4476                 r = amdgpu_bo_restore_shadow(shadow, &next);
4477                 if (r)
4478                         break;
4479
4480                 if (fence) {
4481                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4482                         dma_fence_put(fence);
4483                         fence = next;
4484                         if (tmo == 0) {
4485                                 r = -ETIMEDOUT;
4486                                 break;
4487                         } else if (tmo < 0) {
4488                                 r = tmo;
4489                                 break;
4490                         }
4491                 } else {
4492                         fence = next;
4493                 }
4494         }
4495         mutex_unlock(&adev->shadow_list_lock);
4496
4497         if (fence)
4498                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4499         dma_fence_put(fence);
4500
4501         if (r < 0 || tmo <= 0) {
4502                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4503                 return -EIO;
4504         }
4505
4506         dev_info(adev->dev, "recover vram bo from shadow done\n");
4507         return 0;
4508 }
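
/*
 * Note on the timeout handling above: dma_fence_wait_timeout() returns the
 * remaining jiffies on success, 0 on timeout and a negative error code on
 * failure, which lets the loop charge all waits against one shared budget.
 * A minimal sketch of that pattern in isolation:
 *
 *	long budget = msecs_to_jiffies(100);
 *
 *	budget = dma_fence_wait_timeout(fence, false, budget);
 *	if (budget == 0)
 *		return -ETIMEDOUT;	// budget exhausted
 *	if (budget < 0)
 *		return budget;		// the wait itself failed
 *	// otherwise budget holds what is left for the next fence
 */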
4509
4510
4511 /**
4512  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4513  *
4514  * @adev: amdgpu_device pointer
4515  * @from_hypervisor: request from hypervisor
4516  *
4517  * Do a VF FLR and reinitialize the ASIC.
4518  * Returns 0 on success, negative error code on failure.
4519  */
4520 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4521                                      bool from_hypervisor)
4522 {
4523         int r;
4524         struct amdgpu_hive_info *hive = NULL;
4525         int retry_limit = 0;
4526
4527 retry:
4528         amdgpu_amdkfd_pre_reset(adev);
4529
4530         if (from_hypervisor)
4531                 r = amdgpu_virt_request_full_gpu(adev, true);
4532         else
4533                 r = amdgpu_virt_reset_gpu(adev);
4534         if (r)
4535                 return r;
4536
4537         /* Resume IP prior to SMC */
4538         r = amdgpu_device_ip_reinit_early_sriov(adev);
4539         if (r)
4540                 goto error;
4541
4542         amdgpu_virt_init_data_exchange(adev);
4543
4544         r = amdgpu_device_fw_loading(adev);
4545         if (r)
4546                 goto error;
4547
4548         /* now we are okay to resume SMC/CP/SDMA */
4549         r = amdgpu_device_ip_reinit_late_sriov(adev);
4550         if (r)
4551                 goto error;
4552
4553         hive = amdgpu_get_xgmi_hive(adev);
4554         /* Update PSP FW topology after reset */
4555         if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4556                 r = amdgpu_xgmi_update_topology(hive, adev);
4557
4558         if (hive)
4559                 amdgpu_put_xgmi_hive(hive);
4560
4561         if (!r) {
4562                 amdgpu_irq_gpu_reset_resume_helper(adev);
4563                 r = amdgpu_ib_ring_tests(adev);
4564
4565                 amdgpu_amdkfd_post_reset(adev);
4566         }
4567
4568 error:
4569         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4570                 amdgpu_inc_vram_lost(adev);
4571                 r = amdgpu_device_recover_vram(adev);
4572         }
4573         amdgpu_virt_release_full_gpu(adev, true);
4574
4575         if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4576                 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4577                         retry_limit++;
4578                         goto retry;
4579                 } else
4580                         DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4581         }
4582
4583         return r;
4584 }
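
/*
 * The retry logic above keys off AMDGPU_RETRY_SRIOV_RESET(), which treats
 * -EBUSY, -ETIMEDOUT and -EINVAL as transient VF FLR failures.  A
 * stripped-down sketch of the same bounded retry shape (do_vf_reset is a
 * hypothetical stand-in for the reset path):
 *
 *	int retry = 0, r;
 *
 *	do {
 *		r = do_vf_reset();
 *	} while (AMDGPU_RETRY_SRIOV_RESET(r) &&
 *		 ++retry <= AMDGPU_MAX_RETRY_LIMIT);
 */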
4585
4586 /**
4587  * amdgpu_device_has_job_running - check if there is any job in the pending list
4588  *
4589  * @adev: amdgpu_device pointer
4590  *
4591  * Check whether any scheduler ring still has a job in its pending list.
4592  */
4593 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4594 {
4595         int i;
4596         struct drm_sched_job *job;
4597
4598         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4599                 struct amdgpu_ring *ring = adev->rings[i];
4600
4601                 if (!ring || !ring->sched.thread)
4602                         continue;
4603
4604                 spin_lock(&ring->sched.job_list_lock);
4605                 job = list_first_entry_or_null(&ring->sched.pending_list,
4606                                                struct drm_sched_job, list);
4607                 spin_unlock(&ring->sched.job_list_lock);
4608                 if (job)
4609                         return true;
4610         }
4611         return false;
4612 }
4613
4614 /**
4615  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4616  *
4617  * @adev: amdgpu_device pointer
4618  *
4619  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4620  * a hung GPU.
4621  */
4622 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4623 {
4624
4625         if (amdgpu_gpu_recovery == 0)
4626                 goto disabled;
4627
4628         /* Skip soft reset check in fatal error mode */
4629         if (!amdgpu_ras_is_poison_mode_supported(adev))
4630                 return true;
4631
4632         if (amdgpu_sriov_vf(adev))
4633                 return true;
4634
4635         if (amdgpu_gpu_recovery == -1) {
4636                 switch (adev->asic_type) {
4637 #ifdef CONFIG_DRM_AMDGPU_SI
4638                 case CHIP_VERDE:
4639                 case CHIP_TAHITI:
4640                 case CHIP_PITCAIRN:
4641                 case CHIP_OLAND:
4642                 case CHIP_HAINAN:
4643 #endif
4644 #ifdef CONFIG_DRM_AMDGPU_CIK
4645                 case CHIP_KAVERI:
4646                 case CHIP_KABINI:
4647                 case CHIP_MULLINS:
4648 #endif
4649                 case CHIP_CARRIZO:
4650                 case CHIP_STONEY:
4651                 case CHIP_CYAN_SKILLFISH:
4652                         goto disabled;
4653                 default:
4654                         break;
4655                 }
4656         }
4657
4658         return true;
4659
4660 disabled:
4661         dev_info(adev->dev, "GPU recovery disabled.\n");
4662         return false;
4663 }
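
/*
 * amdgpu_gpu_recovery is backed by the gpu_recovery module parameter:
 * -1 (the default) selects the per-ASIC policy implemented above, 0 disables
 * recovery outright, and any other value enables it.  E.g. (illustrative):
 *
 *	modprobe amdgpu gpu_recovery=1
 */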
4664
4665 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4666 {
4667         u32 i;
4668         int ret = 0;
4669
4670         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4671
4672         dev_info(adev->dev, "GPU mode1 reset\n");
4673
4674         /* disable BM */
4675         pci_clear_master(adev->pdev);
4676
4677         amdgpu_device_cache_pci_state(adev->pdev);
4678
4679         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4680                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4681                 ret = amdgpu_dpm_mode1_reset(adev);
4682         } else {
4683                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4684                 ret = psp_gpu_reset(adev);
4685         }
4686
4687         if (ret)
4688                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4689
4690         amdgpu_device_load_pci_state(adev->pdev);
4691
4692         /* wait for asic to come out of reset */
4693         for (i = 0; i < adev->usec_timeout; i++) {
4694                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4695
4696                 if (memsize != 0xffffffff)
4697                         break;
4698                 udelay(1);
4699         }
4700
4701         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4702         return ret;
4703 }
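
/*
 * While the ASIC is in mode1 reset, PCIe reads return all ones, so the loop
 * above polls a register with a known-good value (the NBIO memsize) until it
 * stops reading back 0xffffffff.  The idiom in isolation (illustrative
 * helper, not part of the driver):
 *
 *	static bool example_asic_responding(struct amdgpu_device *adev)
 *	{
 *		return adev->nbio.funcs->get_memsize(adev) != 0xffffffff;
 *	}
 */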
4704
4705 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4706                                  struct amdgpu_reset_context *reset_context)
4707 {
4708         int i, r = 0;
4709         struct amdgpu_job *job = NULL;
4710         bool need_full_reset =
4711                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4712
4713         if (reset_context->reset_req_dev == adev)
4714                 job = reset_context->job;
4715
4716         if (amdgpu_sriov_vf(adev)) {
4717                 /* stop the data exchange thread */
4718                 amdgpu_virt_fini_data_exchange(adev);
4719         }
4720
4721         amdgpu_fence_driver_isr_toggle(adev, true);
4722
4723         /* block all schedulers and reset given job's ring */
4724         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4725                 struct amdgpu_ring *ring = adev->rings[i];
4726
4727                 if (!ring || !ring->sched.thread)
4728                         continue;
4729
4730                 /* Clear the job fences from the fence driver so that
4731                  * force_completion leaves only NULL and VM flush fences in it. */
4732                 amdgpu_fence_driver_clear_job_fences(ring);
4733
4734                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4735                 amdgpu_fence_driver_force_completion(ring);
4736         }
4737
4738         amdgpu_fence_driver_isr_toggle(adev, false);
4739
4740         if (job && job->vm)
4741                 drm_sched_increase_karma(&job->base);
4742
4743         r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4744         /* If reset handler not implemented, continue; otherwise return */
4745         if (r == -ENOSYS)
4746                 r = 0;
4747         else
4748                 return r;
4749
4750         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4751         if (!amdgpu_sriov_vf(adev)) {
4752
4753                 if (!need_full_reset)
4754                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4755
4756                 if (!need_full_reset && amdgpu_gpu_recovery &&
4757                     amdgpu_device_ip_check_soft_reset(adev)) {
4758                         amdgpu_device_ip_pre_soft_reset(adev);
4759                         r = amdgpu_device_ip_soft_reset(adev);
4760                         amdgpu_device_ip_post_soft_reset(adev);
4761                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4762                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4763                                 need_full_reset = true;
4764                         }
4765                 }
4766
4767                 if (need_full_reset)
4768                         r = amdgpu_device_ip_suspend(adev);
4769                 if (need_full_reset)
4770                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4771                 else
4772                         clear_bit(AMDGPU_NEED_FULL_RESET,
4773                                   &reset_context->flags);
4774         }
4775
4776         return r;
4777 }
4778
4779 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4780 {
4781         int i;
4782
4783         lockdep_assert_held(&adev->reset_domain->sem);
4784
4785         for (i = 0; i < adev->num_regs; i++) {
4786                 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4787                 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4788                                              adev->reset_dump_reg_value[i]);
4789         }
4790
4791         return 0;
4792 }
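
/*
 * The register list walked here is supplied from user space; on kernels with
 * debugfs, the offsets to capture are written to the per-device
 * amdgpu_reset_dump_register_list file before a reset is triggered, e.g.
 * (illustrative offset and DRI index):
 *
 *	echo 0x98f8 > /sys/kernel/debug/dri/0/amdgpu_reset_dump_register_list
 */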
4793
4794 #ifdef CONFIG_DEV_COREDUMP
4795 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4796                 size_t count, void *data, size_t datalen)
4797 {
4798         struct drm_printer p;
4799         struct amdgpu_device *adev = data;
4800         struct drm_print_iterator iter;
4801         int i;
4802
4803         iter.data = buffer;
4804         iter.offset = 0;
4805         iter.start = offset;
4806         iter.remain = count;
4807
4808         p = drm_coredump_printer(&iter);
4809
4810         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4811         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4812         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4813         drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4814         if (adev->reset_task_info.pid)
4815                 drm_printf(&p, "process_name: %s PID: %d\n",
4816                            adev->reset_task_info.process_name,
4817                            adev->reset_task_info.pid);
4818
4819         if (adev->reset_vram_lost)
4820                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4821         if (adev->num_regs) {
4822                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4823
4824                 for (i = 0; i < adev->num_regs; i++)
4825                         drm_printf(&p, "0x%08x: 0x%08x\n",
4826                                    adev->reset_dump_reg_list[i],
4827                                    adev->reset_dump_reg_value[i]);
4828         }
4829
4830         return count - iter.remain;
4831 }
4832
4833 static void amdgpu_devcoredump_free(void *data)
4834 {
4835 }
4836
4837 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4838 {
4839         struct drm_device *dev = adev_to_drm(adev);
4840
4841         ktime_get_ts64(&adev->reset_time);
4842         dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4843                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4844 }
4845 #endif
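
/*
 * Once captured, the dump is published through the devcoredump class device
 * and can be read from user space, e.g. (illustrative device index):
 *
 *	cat /sys/class/devcoredump/devcd1/data
 *
 * Writing anything to that file releases the dump early; otherwise the
 * devcoredump core discards it after its own timeout.
 */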
4846
4847 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4848                          struct amdgpu_reset_context *reset_context)
4849 {
4850         struct amdgpu_device *tmp_adev = NULL;
4851         bool need_full_reset, skip_hw_reset, vram_lost = false;
4852         int r = 0;
4853         bool gpu_reset_for_dev_remove = false;
4854
4855         /* Try reset handler method first */
4856         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4857                                     reset_list);
4858         amdgpu_reset_reg_dumps(tmp_adev);
4859
4860         reset_context->reset_device_list = device_list_handle;
4861         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4862         /* If reset handler not implemented, continue; otherwise return */
4863         if (r == -ENOSYS)
4864                 r = 0;
4865         else
4866                 return r;
4867
4868         /* Reset handler not implemented, use the default method */
4869         need_full_reset =
4870                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4871         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4872
4873         gpu_reset_for_dev_remove =
4874                 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4875                         test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4876
4877         /*
4878          * ASIC reset has to be done on all XGMI hive nodes ASAP
4879          * to allow proper link negotiation in FW (within 1 sec)
4880          */
4881         if (!skip_hw_reset && need_full_reset) {
4882                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4883                         /* For XGMI run all resets in parallel to speed up the process */
4884                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4885                                 tmp_adev->gmc.xgmi.pending_reset = false;
4886                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4887                                         r = -EALREADY;
4888                         } else
4889                                 r = amdgpu_asic_reset(tmp_adev);
4890
4891                         if (r) {
4892                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4893                                          r, adev_to_drm(tmp_adev)->unique);
4894                                 break;
4895                         }
4896                 }
4897
4898                 /* For XGMI wait for all resets to complete before proceed */
4899                 if (!r) {
4900                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4901                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4902                                         flush_work(&tmp_adev->xgmi_reset_work);
4903                                         r = tmp_adev->asic_reset_res;
4904                                         if (r)
4905                                                 break;
4906                                 }
4907                         }
4908                 }
4909         }
4910
4911         if (!r && amdgpu_ras_intr_triggered()) {
4912                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4913                         if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4914                             tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4915                                 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4916                 }
4917
4918                 amdgpu_ras_intr_cleared();
4919         }
4920
4921         /* Since a mode1 reset affects the base IP blocks, the
4922          * phase1 IP blocks need to be resumed here.  Otherwise there
4923          * will be a BIOS signature error and the PSP bootloader
4924          * can't load kdb the next time the driver is probed.
4925          */
4926         if (gpu_reset_for_dev_remove) {
4927                 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4928                         amdgpu_device_ip_resume_phase1(tmp_adev);
4929
4930                 goto end;
4931         }
4932
4933         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4934                 if (need_full_reset) {
4935                         /* post card */
4936                         r = amdgpu_device_asic_init(tmp_adev);
4937                         if (r) {
4938                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4939                         } else {
4940                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4941                                 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4942                                 if (r)
4943                                         goto out;
4944
4945                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4946                                 if (r)
4947                                         goto out;
4948
4949                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4950 #ifdef CONFIG_DEV_COREDUMP
4951                                 tmp_adev->reset_vram_lost = vram_lost;
4952                                 memset(&tmp_adev->reset_task_info, 0,
4953                                                 sizeof(tmp_adev->reset_task_info));
4954                                 if (reset_context->job && reset_context->job->vm)
4955                                         tmp_adev->reset_task_info =
4956                                                 reset_context->job->vm->task_info;
4957                                 amdgpu_reset_capture_coredumpm(tmp_adev);
4958 #endif
4959                                 if (vram_lost) {
4960                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4961                                         amdgpu_inc_vram_lost(tmp_adev);
4962                                 }
4963
4964                                 r = amdgpu_device_fw_loading(tmp_adev);
4965                                 if (r)
4966                                         return r;
4967
4968                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4969                                 if (r)
4970                                         goto out;
4971
4972                                 if (vram_lost)
4973                                         amdgpu_device_fill_reset_magic(tmp_adev);
4974
4975                                 /*
4976                                  * Add this ASIC back as tracked since the reset
4977                                  * has already completed successfully.
4978                                  */
4979                                 amdgpu_register_gpu_instance(tmp_adev);
4980
4981                                 if (!reset_context->hive &&
4982                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4983                                         amdgpu_xgmi_add_device(tmp_adev);
4984
4985                                 r = amdgpu_device_ip_late_init(tmp_adev);
4986                                 if (r)
4987                                         goto out;
4988
4989                                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4990
4991                                 /*
4992                                  * The GPU enters a bad state once the number of
4993                                  * faulty pages caught by ECC reaches the threshold,
4994                                  * and RAS recovery is scheduled next.  So check here
4995                                  * whether the bad page threshold has indeed been
4996                                  * exceeded, break recovery if so, and remind the
4997                                  * user to either retire this GPU or set a bigger
4998                                  * bad_page_threshold value the next time the
4999                                  * driver is probed.
5000                                  */
5001                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5002                                         /* must succeed. */
5003                                         amdgpu_ras_resume(tmp_adev);
5004                                 } else {
5005                                         r = -EINVAL;
5006                                         goto out;
5007                                 }
5008
5009                                 /* Update PSP FW topology after reset */
5010                                 if (reset_context->hive &&
5011                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5012                                         r = amdgpu_xgmi_update_topology(
5013                                                 reset_context->hive, tmp_adev);
5014                         }
5015                 }
5016
5017 out:
5018                 if (!r) {
5019                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5020                         r = amdgpu_ib_ring_tests(tmp_adev);
5021                         if (r) {
5022                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5023                                 need_full_reset = true;
5024                                 r = -EAGAIN;
5025                                 goto end;
5026                         }
5027                 }
5028
5029                 if (!r)
5030                         r = amdgpu_device_recover_vram(tmp_adev);
5031                 else
5032                         tmp_adev->asic_reset_res = r;
5033         }
5034
5035 end:
5036         if (need_full_reset)
5037                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5038         else
5039                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5040         return r;
5041 }
5042
5043 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5044 {
5045
5046         switch (amdgpu_asic_reset_method(adev)) {
5047         case AMD_RESET_METHOD_MODE1:
5048                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5049                 break;
5050         case AMD_RESET_METHOD_MODE2:
5051                 adev->mp1_state = PP_MP1_STATE_RESET;
5052                 break;
5053         default:
5054                 adev->mp1_state = PP_MP1_STATE_NONE;
5055                 break;
5056         }
5057 }
5058
5059 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5060 {
5061         amdgpu_vf_error_trans_all(adev);
5062         adev->mp1_state = PP_MP1_STATE_NONE;
5063 }
5064
5065 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5066 {
5067         struct pci_dev *p = NULL;
5068
5069         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5070                         adev->pdev->bus->number, 1);
5071         if (p) {
5072                 pm_runtime_enable(&(p->dev));
5073                 pm_runtime_resume(&(p->dev));
5074         }
5075
5076         pci_dev_put(p);
5077 }
5078
5079 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5080 {
5081         enum amd_reset_method reset_method;
5082         struct pci_dev *p = NULL;
5083         u64 expires;
5084
5085         /*
5086          * For now, only BACO and mode1 reset are confirmed to
5087          * suffer the audio issue if not properly suspended first.
5088          */
5089         reset_method = amdgpu_asic_reset_method(adev);
5090         if ((reset_method != AMD_RESET_METHOD_BACO) &&
5091              (reset_method != AMD_RESET_METHOD_MODE1))
5092                 return -EINVAL;
5093
5094         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5095                         adev->pdev->bus->number, 1);
5096         if (!p)
5097                 return -ENODEV;
5098
5099         expires = pm_runtime_autosuspend_expiration(&(p->dev));
5100         if (!expires)
5101                 /*
5102                  * If we cannot get the audio device's autosuspend delay,
5103                  * fall back to a fixed 4s interval.  Since 3s is the audio
5104                  * controller's default autosuspend delay setting, the 4s
5105                  * used here is guaranteed to cover it.
5106                  */
5107                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5108
5109         while (!pm_runtime_status_suspended(&(p->dev))) {
5110                 if (!pm_runtime_suspend(&(p->dev)))
5111                         break;
5112
5113                 if (expires < ktime_get_mono_fast_ns()) {
5114                         dev_warn(adev->dev, "failed to suspend display audio\n");
5115                         pci_dev_put(p);
5116                         /* TODO: abort the succeeding gpu reset? */
5117                         return -ETIMEDOUT;
5118                 }
5119         }
5120
5121         pm_runtime_disable(&(p->dev));
5122
5123         pci_dev_put(p);
5124         return 0;
5125 }
5126
5127 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5128 {
5129         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5130
5131 #if defined(CONFIG_DEBUG_FS)
5132         if (!amdgpu_sriov_vf(adev))
5133                 cancel_work(&adev->reset_work);
5134 #endif
5135
5136         if (adev->kfd.dev)
5137                 cancel_work(&adev->kfd.reset_work);
5138
5139         if (amdgpu_sriov_vf(adev))
5140                 cancel_work(&adev->virt.flr_work);
5141
5142         if (con && adev->ras_enabled)
5143                 cancel_work(&con->recovery_work);
5144
5145 }
5146
5147 /**
5148  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5149  *
5150  * @adev: amdgpu_device pointer
5151  * @job: the job that triggered the hang, if any
5152  * @reset_context: amdgpu reset context pointer
5153  *
5154  * Attempt to reset the GPU if it has hung (all ASICs): try a soft reset or a
5155  * full reset and reinitialize the ASIC.  Returns 0 for success or an error on failure.
5156  */
5157
5158 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5159                               struct amdgpu_job *job,
5160                               struct amdgpu_reset_context *reset_context)
5161 {
5162         struct list_head device_list, *device_list_handle =  NULL;
5163         bool job_signaled = false;
5164         struct amdgpu_hive_info *hive = NULL;
5165         struct amdgpu_device *tmp_adev = NULL;
5166         int i, r = 0;
5167         bool need_emergency_restart = false;
5168         bool audio_suspended = false;
5169         bool gpu_reset_for_dev_remove = false;
5170
5171         gpu_reset_for_dev_remove =
5172                         test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5173                                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5174
5175         /*
5176          * Special case: RAS triggered and full reset isn't supported
5177          */
5178         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5179
5180         /*
5181          * Flush RAM to disk so that after reboot
5182          * the user can read log and see why the system rebooted.
5183          */
5184         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5185                 DRM_WARN("Emergency reboot.");
5186
5187                 ksys_sync_helper();
5188                 emergency_restart();
5189         }
5190
5191         dev_info(adev->dev, "GPU %s begin!\n",
5192                 need_emergency_restart ? "jobs stop" : "reset");
5193
5194         if (!amdgpu_sriov_vf(adev))
5195                 hive = amdgpu_get_xgmi_hive(adev);
5196         if (hive)
5197                 mutex_lock(&hive->hive_lock);
5198
5199         reset_context->job = job;
5200         reset_context->hive = hive;
5201         /*
5202          * Build list of devices to reset.
5203          * In case we are in XGMI hive mode, re-sort the device list
5204          * to put adev in the 1st position.
5205          */
5206         INIT_LIST_HEAD(&device_list);
5207         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5208                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5209                         list_add_tail(&tmp_adev->reset_list, &device_list);
5210                         if (gpu_reset_for_dev_remove && adev->shutdown)
5211                                 tmp_adev->shutdown = true;
5212                 }
5213                 if (!list_is_first(&adev->reset_list, &device_list))
5214                         list_rotate_to_front(&adev->reset_list, &device_list);
5215                 device_list_handle = &device_list;
5216         } else {
5217                 list_add_tail(&adev->reset_list, &device_list);
5218                 device_list_handle = &device_list;
5219         }
5220
5221         /* We need to lock the reset domain only once, for both XGMI and single device */
5222         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5223                                     reset_list);
5224         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5225
5226         /* block all schedulers and reset given job's ring */
5227         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5228
5229                 amdgpu_device_set_mp1_state(tmp_adev);
5230
5231                 /*
5232                  * Try to put the audio codec into suspend state
5233                  * before gpu reset started.
5234                  *
5235                  * Because the power domain of the graphics device
5236                  * is shared with the AZ power domain, without this
5237                  * we may change the audio hardware from behind
5238                  * the audio driver's back and trigger audio codec
5239                  * errors.
5240                  */
5241                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5242                         audio_suspended = true;
5243
5244                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5245
5246                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5247
5248                 if (!amdgpu_sriov_vf(tmp_adev))
5249                         amdgpu_amdkfd_pre_reset(tmp_adev);
5250
5251                 /*
5252                  * Mark the ASICs to be reset as untracked first,
5253                  * and add them back after the reset has completed.
5254                  */
5255                 amdgpu_unregister_gpu_instance(tmp_adev);
5256
5257                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5258
5259                 /* disable ras on ALL IPs */
5260                 if (!need_emergency_restart &&
5261                       amdgpu_device_ip_need_full_reset(tmp_adev))
5262                         amdgpu_ras_suspend(tmp_adev);
5263
5264                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5265                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5266
5267                         if (!ring || !ring->sched.thread)
5268                                 continue;
5269
5270                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5271
5272                         if (need_emergency_restart)
5273                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5274                 }
5275                 atomic_inc(&tmp_adev->gpu_reset_counter);
5276         }
5277
5278         if (need_emergency_restart)
5279                 goto skip_sched_resume;
5280
5281         /*
5282          * Must check guilty signal here since after this point all old
5283          * HW fences are force signaled.
5284          *
5285          * job->base holds a reference to parent fence
5286          */
5287         if (job && dma_fence_is_signaled(&job->hw_fence)) {
5288                 job_signaled = true;
5289                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5290                 goto skip_hw_reset;
5291         }
5292
5293 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
5294         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5295                 if (gpu_reset_for_dev_remove) {
5296                         /* Workaround for ASICs that need to disable the SMC first */
5297                         amdgpu_device_smu_fini_early(tmp_adev);
5298                 }
5299                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5300                 /* TODO: Should we stop? */
5301                 if (r) {
5302                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5303                                   r, adev_to_drm(tmp_adev)->unique);
5304                         tmp_adev->asic_reset_res = r;
5305                 }
5306
5307                 /*
5308                  * Drop all pending non-scheduler resets. Scheduler resets
5309                  * were already dropped during drm_sched_stop
5310                  */
5311                 amdgpu_device_stop_pending_resets(tmp_adev);
5312         }
5313
5314         /* Actual ASIC resets if needed. */
5315         /* Host driver will handle XGMI hive reset for SRIOV */
5316         if (amdgpu_sriov_vf(adev)) {
5317                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5318                 if (r)
5319                         adev->asic_reset_res = r;
5320
5321                 /* Aldebaran supports RAS in SRIOV, so resume RAS during the reset */
5322                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5323                         amdgpu_ras_resume(adev);
5324         } else {
5325                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5326                 if (r && r == -EAGAIN)
5327                         goto retry;
5328
5329                 if (!r && gpu_reset_for_dev_remove)
5330                         goto recover_end;
5331         }
5332
5333 skip_hw_reset:
5334
5335         /* Post ASIC reset for all devs. */
5336         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5337
5338                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5339                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5340
5341                         if (!ring || !ring->sched.thread)
5342                                 continue;
5343
5344                         drm_sched_start(&ring->sched, true);
5345                 }
5346
5347                 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5348                         amdgpu_mes_self_test(tmp_adev);
5349
5350                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5351                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5352                 }
5353
5354                 if (tmp_adev->asic_reset_res)
5355                         r = tmp_adev->asic_reset_res;
5356
5357                 tmp_adev->asic_reset_res = 0;
5358
5359                 if (r) {
5360                         /* Bad news: how do we tell userspace? */
5361                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5362                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5363                 } else {
5364                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5365                         if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5366                                 DRM_WARN("smart shift update failed\n");
5367                 }
5368         }
5369
5370 skip_sched_resume:
5371         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5372                 /* unlock kfd: SRIOV would do it separately */
5373                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5374                         amdgpu_amdkfd_post_reset(tmp_adev);
5375
5376                 /* kfd_post_reset will do nothing if the kfd device is not
5377                  * initialized, so bring up kfd here if it wasn't initialized before.
5378                  */
5379                 if (!adev->kfd.init_complete)
5380                         amdgpu_amdkfd_device_init(adev);
5381
5382                 if (audio_suspended)
5383                         amdgpu_device_resume_display_audio(tmp_adev);
5384
5385                 amdgpu_device_unset_mp1_state(tmp_adev);
5386
5387                 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5388         }
5389
5390 recover_end:
5391         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5392                                             reset_list);
5393         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5394
5395         if (hive) {
5396                 mutex_unlock(&hive->hive_lock);
5397                 amdgpu_put_xgmi_hive(hive);
5398         }
5399
5400         if (r)
5401                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5402
5403         atomic_set(&adev->reset_domain->reset_res, r);
5404         return r;
5405 }
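
/*
 * Illustrative sketch of the main caller shape: the job timeout handler in
 * amdgpu_job.c fills an amdgpu_reset_context and hands the hung job to
 * amdgpu_device_gpu_recover() roughly as below (trimmed to the relevant
 * fields):
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
 */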
5406
5407 /**
5408  * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5409  *
5410  * @adev: amdgpu_device pointer
5411  *
5412  * Fetches and stores in the driver the PCIe capabilities (gen speed
5413  * and lanes) of the slot the device is in. Handles APUs and
5414  * virtualized environments where PCIE config space may not be available.
5415  */
5416 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5417 {
5418         struct pci_dev *pdev;
5419         enum pci_bus_speed speed_cap, platform_speed_cap;
5420         enum pcie_link_width platform_link_width;
5421
5422         if (amdgpu_pcie_gen_cap)
5423                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5424
5425         if (amdgpu_pcie_lane_cap)
5426                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5427
5428         /* covers APUs as well */
5429         if (pci_is_root_bus(adev->pdev->bus)) {
5430                 if (adev->pm.pcie_gen_mask == 0)
5431                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5432                 if (adev->pm.pcie_mlw_mask == 0)
5433                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5434                 return;
5435         }
5436
5437         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5438                 return;
5439
5440         pcie_bandwidth_available(adev->pdev, NULL,
5441                                  &platform_speed_cap, &platform_link_width);
5442
5443         if (adev->pm.pcie_gen_mask == 0) {
5444                 /* asic caps */
5445                 pdev = adev->pdev;
5446                 speed_cap = pcie_get_speed_cap(pdev);
5447                 if (speed_cap == PCI_SPEED_UNKNOWN) {
5448                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5449                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5450                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5451                 } else {
5452                         if (speed_cap == PCIE_SPEED_32_0GT)
5453                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5454                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5455                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5456                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5457                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5458                         else if (speed_cap == PCIE_SPEED_16_0GT)
5459                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5460                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5461                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5462                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5463                         else if (speed_cap == PCIE_SPEED_8_0GT)
5464                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5465                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5466                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5467                         else if (speed_cap == PCIE_SPEED_5_0GT)
5468                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5469                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5470                         else
5471                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5472                 }
5473                 /* platform caps */
5474                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5475                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5476                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5477                 } else {
5478                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
5479                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5480                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5481                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5482                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5483                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5484                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5485                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5486                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5487                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5488                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5489                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5490                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5491                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5492                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5493                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5494                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5495                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5496                         else
5497                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5498
5499                 }
5500         }
5501         if (adev->pm.pcie_mlw_mask == 0) {
5502                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5503                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5504                 } else {
5505                         switch (platform_link_width) {
5506                         case PCIE_LNK_X32:
5507                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5508                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5509                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5510                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5511                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5512                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5513                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5514                                 break;
5515                         case PCIE_LNK_X16:
5516                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5517                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5518                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5519                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5520                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5521                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5522                                 break;
5523                         case PCIE_LNK_X12:
5524                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5525                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5526                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5527                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5528                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5529                                 break;
5530                         case PCIE_LNK_X8:
5531                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5532                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5533                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5534                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5535                                 break;
5536                         case PCIE_LNK_X4:
5537                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5538                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5539                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5540                                 break;
5541                         case PCIE_LNK_X2:
5542                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5543                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5544                                 break;
5545                         case PCIE_LNK_X1:
5546                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5547                                 break;
5548                         default:
5549                                 break;
5550                         }
5551                 }
5552         }
5553 }
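
/*
 * Both masks can also be pinned from user space before probe via the
 * pcie_gen_cap and pcie_lane_cap module parameters (raw CAIL masks from
 * amd_pcie.h), which short-circuits the detection above.  E.g. (illustrative
 * value forcing gen1 support on both the ASIC and the platform):
 *
 *	modprobe amdgpu pcie_gen_cap=0x00010001
 */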
5554
5555 /**
5556  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5557  *
5558  * @adev: amdgpu_device pointer
5559  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5560  *
5561  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5562  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5563  * @peer_adev.
5564  */
5565 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5566                                       struct amdgpu_device *peer_adev)
5567 {
5568 #ifdef CONFIG_HSA_AMD_P2P
5569         uint64_t address_mask = peer_adev->dev->dma_mask ?
5570                 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5571         resource_size_t aper_limit =
5572                 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5573         bool p2p_access =
5574                 !adev->gmc.xgmi.connected_to_cpu &&
5575                 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5576
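        /*
         * Peer access needs PCIe P2P to be both enabled and physically
         * possible, a fully CPU-visible ("large BAR") VRAM aperture, and
         * an aperture range that lies entirely within the peer's DMA mask.
         */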
5577         return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5578                 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5579                 !(adev->gmc.aper_base & address_mask ||
5580                   aper_limit & address_mask));
5581 #else
5582         return false;
5583 #endif
5584 }
5585
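/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Disable the RAS doorbell interrupt, if applicable, and ask the power
 * management code to put the device into the BACO state.
 *
 * Returns 0 on success, -ENOTSUPP if the device doesn't support BACO,
 * or a negative error code on failure.
 */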
5586 int amdgpu_device_baco_enter(struct drm_device *dev)
5587 {
5588         struct amdgpu_device *adev = drm_to_adev(dev);
5589         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5590
5591         if (!amdgpu_device_supports_baco(dev))
5592                 return -ENOTSUPP;
5593
5594         if (ras && adev->ras_enabled &&
5595             adev->nbio.funcs->enable_doorbell_interrupt)
5596                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5597
5598         return amdgpu_dpm_baco_enter(adev);
5599 }
5600
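/**
 * amdgpu_device_baco_exit - exit BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Bring the device back out of the BACO state, re-enable the RAS
 * doorbell interrupt, if applicable, and clear any stale doorbell
 * interrupt when running in passthrough virtualization.
 *
 * Returns 0 on success, -ENOTSUPP if the device doesn't support BACO,
 * or a negative error code on failure.
 */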
5601 int amdgpu_device_baco_exit(struct drm_device *dev)
5602 {
5603         struct amdgpu_device *adev = drm_to_adev(dev);
5604         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5605         int ret = 0;
5606
5607         if (!amdgpu_device_supports_baco(dev))
5608                 return -ENOTSUPP;
5609
5610         ret = amdgpu_dpm_baco_exit(adev);
5611         if (ret)
5612                 return ret;
5613
5614         if (ras && adev->ras_enabled &&
5615             adev->nbio.funcs->enable_doorbell_interrupt)
5616                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5617
5618         if (amdgpu_passthrough(adev) &&
5619             adev->nbio.funcs->clear_doorbell_interrupt)
5620                 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5621
5622         return 0;
5623 }
5624
5625 /**
5626  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5627  * @pdev: PCI device struct
5628  * @state: PCI channel state
5629  *
5630  * Description: Called when a PCI error is detected.
5631  *
5632  * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5633  */
5634 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5635 {
5636         struct drm_device *dev = pci_get_drvdata(pdev);
5637         struct amdgpu_device *adev = drm_to_adev(dev);
5638         int i;
5639
5640         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5641
5642         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5643                 DRM_WARN("No support for XGMI hive yet...");
5644                 return PCI_ERS_RESULT_DISCONNECT;
5645         }
5646
5647         adev->pci_channel_state = state;
5648
5649         switch (state) {
5650         case pci_channel_io_normal:
5651                 return PCI_ERS_RESULT_CAN_RECOVER;
5652         /* Fatal error, prepare for slot reset */
5653         case pci_channel_io_frozen:
5654                 /*
5655                  * Locking adev->reset_domain->sem will prevent any external access
5656                  * to GPU during PCI error recovery
5657                  */
5658                 amdgpu_device_lock_reset_domain(adev->reset_domain);
5659                 amdgpu_device_set_mp1_state(adev);
5660
5661                 /*
5662                  * Block any work scheduling as we do for regular GPU reset
5663                  * for the duration of the recovery
5664                  */
5665                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5666                         struct amdgpu_ring *ring = adev->rings[i];
5667
5668                         if (!ring || !ring->sched.thread)
5669                                 continue;
5670
5671                         drm_sched_stop(&ring->sched, NULL);
5672                 }
5673                 atomic_inc(&adev->gpu_reset_counter);
5674                 return PCI_ERS_RESULT_NEED_RESET;
5675         case pci_channel_io_perm_failure:
5676                 /* Permanent error, prepare for device removal */
5677                 return PCI_ERS_RESULT_DISCONNECT;
5678         }
5679
5680         return PCI_ERS_RESULT_NEED_RESET;
5681 }
5682
5683 /**
5684  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5685  * @pdev: pointer to PCI device
5686  */
5687 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5688 {
5689
5690         DRM_INFO("PCI error: mmio enabled callback!!\n");
5691
5692         /* TODO - dump whatever for debugging purposes */
5693
5694         /* This is called only if amdgpu_pci_error_detected returns
5695          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5696          * works, so there is no need to reset the slot.
5697          */
5698
5699         return PCI_ERS_RESULT_RECOVERED;
5700 }
5701
5702 /**
5703  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5704  * @pdev: PCI device struct
5705  *
5706  * Description: This routine is called by the PCI error recovery
5707  * code after the PCI slot has been reset, just before we
5708  * should resume normal operations.
5709  */
5710 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5711 {
5712         struct drm_device *dev = pci_get_drvdata(pdev);
5713         struct amdgpu_device *adev = drm_to_adev(dev);
5714         int r, i;
5715         struct amdgpu_reset_context reset_context;
5716         u32 memsize;
5717         struct list_head device_list;
5718
5719         DRM_INFO("PCI error: slot reset callback!!\n");
5720
5721         memset(&reset_context, 0, sizeof(reset_context));
5722
5723         INIT_LIST_HEAD(&device_list);
5724         list_add_tail(&adev->reset_list, &device_list);
5725
5726         /* wait for asic to come out of reset */
5727         msleep(500);
5728
5729         /* Restore the PCI config space */
5730         amdgpu_device_load_pci_state(pdev);
5731
5732         /* confirm the ASIC came out of reset */
5733         for (i = 0; i < adev->usec_timeout; i++) {
5734                 memsize = amdgpu_asic_get_config_memsize(adev);
5735
5736                 if (memsize != 0xffffffff)
5737                         break;
5738                 udelay(1);
5739         }
5740         if (memsize == 0xffffffff) {
5741                 r = -ETIME;
5742                 goto out;
5743         }
5744
5745         reset_context.method = AMD_RESET_METHOD_NONE;
5746         reset_context.reset_req_dev = adev;
5747         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5748         set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5749
5750         adev->no_hw_access = true;
5751         r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5752         adev->no_hw_access = false;
5753         if (r)
5754                 goto out;
5755
5756         r = amdgpu_do_asic_reset(&device_list, &reset_context);
5757
5758 out:
5759         if (!r) {
5760                 if (amdgpu_device_cache_pci_state(adev->pdev))
5761                         pci_restore_state(adev->pdev);
5762
5763                 DRM_INFO("PCIe error recovery succeeded\n");
5764         } else {
5765                 DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
5766                 amdgpu_device_unset_mp1_state(adev);
5767                 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5768         }
5769
5770         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5771 }
5772
5773 /**
5774  * amdgpu_pci_resume() - resume normal ops after PCI reset
5775  * @pdev: pointer to PCI device
5776  *
5777  * Called when the error recovery driver tells us that it's
5778  * OK to resume normal operation.
5779  */
5780 void amdgpu_pci_resume(struct pci_dev *pdev)
5781 {
5782         struct drm_device *dev = pci_get_drvdata(pdev);
5783         struct amdgpu_device *adev = drm_to_adev(dev);
5784         int i;
5785
5786
5787         DRM_INFO("PCI error: resume callback!!\n");
5788
5789         /* Only continue execution for the case of pci_channel_io_frozen */
5790         if (adev->pci_channel_state != pci_channel_io_frozen)
5791                 return;
5792
5793         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5794                 struct amdgpu_ring *ring = adev->rings[i];
5795
5796                 if (!ring || !ring->sched.thread)
5797                         continue;
5798
5799                 drm_sched_start(&ring->sched, true);
5800         }
5801
5802         amdgpu_device_unset_mp1_state(adev);
5803         amdgpu_device_unlock_reset_domain(adev->reset_domain);
5804 }
5805
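/**
 * amdgpu_device_cache_pci_state - save and cache the PCI config space
 *
 * @pdev: PCI device struct
 *
 * Save the PCI config space and keep a kernel copy of it in
 * adev->pci_state so that it can be reloaded after a GPU reset.
 *
 * Returns true on success, false otherwise.
 */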
5806 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5807 {
5808         struct drm_device *dev = pci_get_drvdata(pdev);
5809         struct amdgpu_device *adev = drm_to_adev(dev);
5810         int r;
5811
5812         r = pci_save_state(pdev);
5813         if (!r) {
5814                 kfree(adev->pci_state);
5815
5816                 adev->pci_state = pci_store_saved_state(pdev);
5817
5818                 if (!adev->pci_state) {
5819                         DRM_ERROR("Failed to store PCI saved state\n");
5820                         return false;
5821                 }
5822         } else {
5823                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5824                 return false;
5825         }
5826
5827         return true;
5828 }
5829
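/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Reload the PCI config space cached by amdgpu_device_cache_pci_state()
 * and write it back to the device.
 *
 * Returns true on success, false otherwise.
 */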
5830 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5831 {
5832         struct drm_device *dev = pci_get_drvdata(pdev);
5833         struct amdgpu_device *adev = drm_to_adev(dev);
5834         int r;
5835
5836         if (!adev->pci_state)
5837                 return false;
5838
5839         r = pci_load_saved_state(pdev, adev->pci_state);
5840
5841         if (!r) {
5842                 pci_restore_state(pdev);
5843         } else {
5844                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5845                 return false;
5846         }
5847
5848         return true;
5849 }
5850
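/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: ring to emit the flush on, or NULL to flush through MMIO
 *
 * Flush the HDP cache so that CPU writes to VRAM become visible to the
 * GPU. This is a no-op on x86-64 bare-metal APUs and on devices whose
 * VRAM is directly connected to the CPU.
 */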
5851 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5852                 struct amdgpu_ring *ring)
5853 {
5854 #ifdef CONFIG_X86_64
5855         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5856                 return;
5857 #endif
5858         if (adev->gmc.xgmi.connected_to_cpu)
5859                 return;
5860
5861         if (ring && ring->funcs->emit_hdp_flush)
5862                 amdgpu_ring_emit_hdp_flush(ring);
5863         else
5864                 amdgpu_asic_flush_hdp(adev, ring);
5865 }
5866
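/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: ring the invalidation is issued for, may be NULL
 *
 * Invalidate the HDP cache so that the CPU reads up-to-date data from
 * VRAM. Skipped in the same cases as amdgpu_device_flush_hdp().
 */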
5867 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5868                 struct amdgpu_ring *ring)
5869 {
5870 #ifdef CONFIG_X86_64
5871         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5872                 return;
5873 #endif
5874         if (adev->gmc.xgmi.connected_to_cpu)
5875                 return;
5876
5877         amdgpu_asic_invalidate_hdp(adev, ring);
5878 }
5879
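/**
 * amdgpu_in_reset - check whether the GPU is currently in reset
 *
 * @adev: amdgpu_device pointer
 *
 * Returns non-zero while a GPU reset is in progress on the reset
 * domain this device belongs to, zero otherwise.
 */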
5880 int amdgpu_in_reset(struct amdgpu_device *adev)
5881 {
5882         return atomic_read(&adev->reset_domain->in_gpu_reset);
5883 }
5884
5885 /**
5886  * amdgpu_device_halt() - bring hardware to some kind of halt state
5887  *
5888  * @adev: amdgpu_device pointer
5889  *
5890  * Bring the hardware to some kind of halt state so that no one can touch it
5891  * any more. This helps preserve the error context when an error occurs.
5892  * Compared to a simple hang, the system stays stable, at least for SSH
5893  * access. Then it should be trivial to inspect the hardware state and
5894  * see what's going on. Implemented as follows:
5895  *
5896  * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
5897  *    clears all CPU mappings to the device, disallows remappings through page faults
5898  * 2. amdgpu_irq_disable_all() disables all interrupts
5899  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5900  * 4. set adev->no_hw_access to avoid potential crashes after step 5
5901  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5902  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5903  *    flush any in flight DMA operations
5904  */
5905 void amdgpu_device_halt(struct amdgpu_device *adev)
5906 {
5907         struct pci_dev *pdev = adev->pdev;
5908         struct drm_device *ddev = adev_to_drm(adev);
5909
5910         drm_dev_unplug(ddev);
5911
5912         amdgpu_irq_disable_all(adev);
5913
5914         amdgpu_fence_driver_hw_fini(adev);
5915
5916         adev->no_hw_access = true;
5917
5918         amdgpu_device_unmap_mmio(adev);
5919
5920         pci_disable_device(pdev);
5921         pci_wait_for_pending_transaction(pdev);
5922 }
5923
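/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword offset of the register
 *
 * Read a PCIe port register through the NBIO index/data pair,
 * serialized by the PCIe index spinlock.
 *
 * Returns the register value.
 */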
5924 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5925                                 u32 reg)
5926 {
5927         unsigned long flags, address, data;
5928         u32 r;
5929
5930         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5931         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5932
5933         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5934         WREG32(address, reg * 4);
5935         (void)RREG32(address);
5936         r = RREG32(data);
5937         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5938         return r;
5939 }
5940
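/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword offset of the register
 * @v: value to write
 *
 * Write a PCIe port register through the NBIO index/data pair,
 * serialized by the PCIe index spinlock. The index and data registers
 * are read back to make sure the writes have landed.
 */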
5941 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5942                                 u32 reg, u32 v)
5943 {
5944         unsigned long flags, address, data;
5945
5946         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5947         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5948
5949         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5950         WREG32(address, reg * 4);
5951         (void)RREG32(address);
5952         WREG32(data, v);
5953         (void)RREG32(data);
5954         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5955 }
5956
5957 /**
5958  * amdgpu_device_switch_gang - switch to a new gang
5959  * @adev: amdgpu_device pointer
5960  * @gang: the gang to switch to
5961  *
5962  * Try to switch to a new gang.
5963  * Returns: NULL if we switched to the new gang or a reference to the current
5964  * gang leader.
5965  */
5966 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
5967                                             struct dma_fence *gang)
5968 {
5969         struct dma_fence *old = NULL;
5970
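        /*
         * Retry loop: take a reference to the current gang fence under
         * RCU. If that fence is still unsignaled, hand it back so the
         * caller can wait on it first; otherwise try to atomically
         * replace it with @gang, retrying if another thread raced us.
         */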
5971         do {
5972                 dma_fence_put(old);
5973                 rcu_read_lock();
5974                 old = dma_fence_get_rcu_safe(&adev->gang_submit);
5975                 rcu_read_unlock();
5976
5977                 if (old == gang)
5978                         break;
5979
5980                 if (!dma_fence_is_signaled(old))
5981                         return old;
5982
5983         } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
5984                          old, gang) != old);
5985
5986         dma_fence_put(old);
5987         return NULL;
5988 }
5989
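/**
 * amdgpu_device_has_display_hardware - check whether the ASIC has display hardware
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true if the ASIC has usable (non-harvested) display hardware,
 * based on a hardcoded per-ASIC list for older chips and on the IP
 * discovery table for newer ones.
 */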
5990 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
5991 {
5992         switch (adev->asic_type) {
5993 #ifdef CONFIG_DRM_AMDGPU_SI
5994         case CHIP_HAINAN:
5995 #endif
5996         case CHIP_TOPAZ:
5997                 /* chips with no display hardware */
5998                 return false;
5999 #ifdef CONFIG_DRM_AMDGPU_SI
6000         case CHIP_TAHITI:
6001         case CHIP_PITCAIRN:
6002         case CHIP_VERDE:
6003         case CHIP_OLAND:
6004 #endif
6005 #ifdef CONFIG_DRM_AMDGPU_CIK
6006         case CHIP_BONAIRE:
6007         case CHIP_HAWAII:
6008         case CHIP_KAVERI:
6009         case CHIP_KABINI:
6010         case CHIP_MULLINS:
6011 #endif
6012         case CHIP_TONGA:
6013         case CHIP_FIJI:
6014         case CHIP_POLARIS10:
6015         case CHIP_POLARIS11:
6016         case CHIP_POLARIS12:
6017         case CHIP_VEGAM:
6018         case CHIP_CARRIZO:
6019         case CHIP_STONEY:
6020                 /* chips with display hardware */
6021                 return true;
6022         default:
6023                 /* IP discovery */
6024                 if (!adev->ip_versions[DCE_HWIP][0] ||
6025                     (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6026                         return false;
6027                 return true;
6028         }
6029 }