drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38
39 #include <drm/drm_aperture.h>
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_fb_helper.h>
42 #include <drm/drm_probe_helper.h>
43 #include <drm/amdgpu_drm.h>
44 #include <linux/vgaarb.h>
45 #include <linux/vga_switcheroo.h>
46 #include <linux/efi.h>
47 #include "amdgpu.h"
48 #include "amdgpu_trace.h"
49 #include "amdgpu_i2c.h"
50 #include "atom.h"
51 #include "amdgpu_atombios.h"
52 #include "amdgpu_atomfirmware.h"
53 #include "amd_pcie.h"
54 #ifdef CONFIG_DRM_AMDGPU_SI
55 #include "si.h"
56 #endif
57 #ifdef CONFIG_DRM_AMDGPU_CIK
58 #include "cik.h"
59 #endif
60 #include "vi.h"
61 #include "soc15.h"
62 #include "nv.h"
63 #include "bif/bif_4_1_d.h"
64 #include <linux/firmware.h>
65 #include "amdgpu_vf_error.h"
66
67 #include "amdgpu_amdkfd.h"
68 #include "amdgpu_pm.h"
69
70 #include "amdgpu_xgmi.h"
71 #include "amdgpu_ras.h"
72 #include "amdgpu_pmu.h"
73 #include "amdgpu_fru_eeprom.h"
74 #include "amdgpu_reset.h"
75
76 #include <linux/suspend.h>
77 #include <drm/task_barrier.h>
78 #include <linux/pm_runtime.h>
79
80 #include <drm/drm_drv.h>
81
82 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
85 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
86 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
87 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
88 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
89
90 #define AMDGPU_RESUME_MS                2000
91 #define AMDGPU_MAX_RETRY_LIMIT          2
92 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
93
94 static const struct drm_driver amdgpu_kms_driver;
95
96 const char *amdgpu_asic_name[] = {
97         "TAHITI",
98         "PITCAIRN",
99         "VERDE",
100         "OLAND",
101         "HAINAN",
102         "BONAIRE",
103         "KAVERI",
104         "KABINI",
105         "HAWAII",
106         "MULLINS",
107         "TOPAZ",
108         "TONGA",
109         "FIJI",
110         "CARRIZO",
111         "STONEY",
112         "POLARIS10",
113         "POLARIS11",
114         "POLARIS12",
115         "VEGAM",
116         "VEGA10",
117         "VEGA12",
118         "VEGA20",
119         "RAVEN",
120         "ARCTURUS",
121         "RENOIR",
122         "ALDEBARAN",
123         "NAVI10",
124         "CYAN_SKILLFISH",
125         "NAVI14",
126         "NAVI12",
127         "SIENNA_CICHLID",
128         "NAVY_FLOUNDER",
129         "VANGOGH",
130         "DIMGREY_CAVEFISH",
131         "BEIGE_GOBY",
132         "YELLOW_CARP",
133         "IP DISCOVERY",
134         "LAST",
135 };
136
137 /**
138  * DOC: pcie_replay_count
139  *
140  * The amdgpu driver provides a sysfs API for reporting the total number
141  * of PCIe replays (NAKs).
142  * The file pcie_replay_count is used for this and returns the total
143  * number of replays as a sum of the NAKs generated and NAKs received.
144  */
145
146 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
147                 struct device_attribute *attr, char *buf)
148 {
149         struct drm_device *ddev = dev_get_drvdata(dev);
150         struct amdgpu_device *adev = drm_to_adev(ddev);
151         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
152
153         return sysfs_emit(buf, "%llu\n", cnt);
154 }
155
156 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
157                 amdgpu_device_get_pcie_replay_count, NULL);
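
/*
 * Example (illustrative only): the attribute declared above is exposed on the
 * PCI device node in sysfs, so with a typical enumeration where card0 is the
 * amdgpu device, the counter can be read from userspace with:
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 *
 * The exact path depends on how the device is enumerated on a given system.
 */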
158
159 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
160
161 /**
162  * DOC: product_name
163  *
164  * The amdgpu driver provides a sysfs API for reporting the product name
165  * for the device.
166  * The file product_name is used for this and returns the product name
167  * as returned from the FRU.
168  * NOTE: This is only available for certain server cards
169  */
170
171 static ssize_t amdgpu_device_get_product_name(struct device *dev,
172                 struct device_attribute *attr, char *buf)
173 {
174         struct drm_device *ddev = dev_get_drvdata(dev);
175         struct amdgpu_device *adev = drm_to_adev(ddev);
176
177         return sysfs_emit(buf, "%s\n", adev->product_name);
178 }
179
180 static DEVICE_ATTR(product_name, S_IRUGO,
181                 amdgpu_device_get_product_name, NULL);
182
183 /**
184  * DOC: product_number
185  *
186  * The amdgpu driver provides a sysfs API for reporting the part number
187  * for the device.
188  * The file product_number is used for this and returns the part number
189  * as returned from the FRU.
190  * NOTE: This is only available for certain server cards
191  */
192
193 static ssize_t amdgpu_device_get_product_number(struct device *dev,
194                 struct device_attribute *attr, char *buf)
195 {
196         struct drm_device *ddev = dev_get_drvdata(dev);
197         struct amdgpu_device *adev = drm_to_adev(ddev);
198
199         return sysfs_emit(buf, "%s\n", adev->product_number);
200 }
201
202 static DEVICE_ATTR(product_number, S_IRUGO,
203                 amdgpu_device_get_product_number, NULL);
204
205 /**
206  * DOC: serial_number
207  *
208  * The amdgpu driver provides a sysfs API for reporting the serial number
209  * for the device.
210  * The file serial_number is used for this and returns the serial number
211  * as returned from the FRU.
212  * NOTE: This is only available for certain server cards
213  */
214
215 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
216                 struct device_attribute *attr, char *buf)
217 {
218         struct drm_device *ddev = dev_get_drvdata(dev);
219         struct amdgpu_device *adev = drm_to_adev(ddev);
220
221         return sysfs_emit(buf, "%s\n", adev->serial);
222 }
223
224 static DEVICE_ATTR(serial_number, S_IRUGO,
225                 amdgpu_device_get_serial_number, NULL);
226
227 /**
228  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
229  *
230  * @dev: drm_device pointer
231  *
232  * Returns true if the device is a dGPU with ATPX power control,
233  * otherwise returns false.
234  */
235 bool amdgpu_device_supports_px(struct drm_device *dev)
236 {
237         struct amdgpu_device *adev = drm_to_adev(dev);
238
239         if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
240                 return true;
241         return false;
242 }
243
244 /**
245  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
246  *
247  * @dev: drm_device pointer
248  *
249  * Returns true if the device is a dGPU with ACPI power control,
250  * otherwise returns false.
251  */
252 bool amdgpu_device_supports_boco(struct drm_device *dev)
253 {
254         struct amdgpu_device *adev = drm_to_adev(dev);
255
256         if (adev->has_pr3 ||
257             ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
258                 return true;
259         return false;
260 }
261
262 /**
263  * amdgpu_device_supports_baco - Does the device support BACO
264  *
265  * @dev: drm_device pointer
266  *
267  * Returns true if the device supports BACO,
268  * otherwise returns false.
269  */
270 bool amdgpu_device_supports_baco(struct drm_device *dev)
271 {
272         struct amdgpu_device *adev = drm_to_adev(dev);
273
274         return amdgpu_asic_supports_baco(adev);
275 }
276
277 /**
278  * amdgpu_device_supports_smart_shift - Is the device a dGPU with
279  * smart shift support
280  *
281  * @dev: drm_device pointer
282  *
283  * Returns true if the device is a dGPU with Smart Shift support,
284  * otherwise returns false.
285  */
286 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
287 {
288         return (amdgpu_device_supports_boco(dev) &&
289                 amdgpu_acpi_is_power_shift_control_supported());
290 }
291
292 /*
293  * VRAM access helper functions
294  */
295
296 /**
297  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
298  *
299  * @adev: amdgpu_device pointer
300  * @pos: offset of the buffer in vram
301  * @buf: virtual address of the buffer in system memory
302  * @size: read/write size; the allocated size of @buf must be at least @size
303  * @write: true - write to vram, otherwise - read from vram
304  */
305 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
306                              void *buf, size_t size, bool write)
307 {
308         unsigned long flags;
309         uint32_t hi = ~0, tmp = 0;
310         uint32_t *data = buf;
311         uint64_t last;
312         int idx;
313
314         if (!drm_dev_enter(adev_to_drm(adev), &idx))
315                 return;
316
317         BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
318
319         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
320         for (last = pos + size; pos < last; pos += 4) {
321                 tmp = pos >> 31;
322
323                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
324                 if (tmp != hi) {
325                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
326                         hi = tmp;
327                 }
328                 if (write)
329                         WREG32_NO_KIQ(mmMM_DATA, *data++);
330                 else
331                         *data++ = RREG32_NO_KIQ(mmMM_DATA);
332         }
333
334         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
335         drm_dev_exit(idx);
336 }
337
338 /**
339  * amdgpu_device_aper_access - access vram by vram aperture
340  *
341  * @adev: amdgpu_device pointer
342  * @pos: offset of the buffer in vram
343  * @buf: virtual address of the buffer in system memory
344  * @size: read/write size; the allocated size of @buf must be at least @size
345  * @write: true - write to vram, otherwise - read from vram
346  *
347  * Returns the number of bytes that have been transferred.
348  */
349 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
350                                  void *buf, size_t size, bool write)
351 {
352 #ifdef CONFIG_64BIT
353         void __iomem *addr;
354         size_t count = 0;
355         uint64_t last;
356
357         if (!adev->mman.aper_base_kaddr)
358                 return 0;
359
360         last = min(pos + size, adev->gmc.visible_vram_size);
361         if (last > pos) {
362                 addr = adev->mman.aper_base_kaddr + pos;
363                 count = last - pos;
364
365                 if (write) {
366                         memcpy_toio(addr, buf, count);
367                         mb();
368                         amdgpu_device_flush_hdp(adev, NULL);
369                 } else {
370                         amdgpu_device_invalidate_hdp(adev, NULL);
371                         mb();
372                         memcpy_fromio(buf, addr, count);
373                 }
374
375         }
376
377         return count;
378 #else
379         return 0;
380 #endif
381 }
382
383 /**
384  * amdgpu_device_vram_access - read/write a buffer in vram
385  *
386  * @adev: amdgpu_device pointer
387  * @pos: offset of the buffer in vram
388  * @buf: virtual address of the buffer in system memory
389  * @size: read/write size; the allocated size of @buf must be at least @size
390  * @write: true - write to vram, otherwise - read from vram
391  */
392 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
393                                void *buf, size_t size, bool write)
394 {
395         size_t count;
396
397         /* try using the vram aperture to access vram first */
398         count = amdgpu_device_aper_access(adev, pos, buf, size, write);
399         size -= count;
400         if (size) {
401                 /* use MM_INDEX/MM_DATA to access the rest of vram */
402                 pos += count;
403                 buf += count;
404                 amdgpu_device_mm_access(adev, pos, buf, size, write);
405         }
406 }
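
/*
 * Usage sketch (illustrative, not a real caller): read and then write back a
 * few dwords at the start of VRAM.  The helper prefers the CPU-visible
 * aperture and transparently falls back to MM_INDEX/MM_DATA for whatever part
 * of the range the aperture cannot cover:
 *
 *   uint32_t data[4];
 *
 *   amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);  // read
 *   amdgpu_device_vram_access(adev, 0, data, sizeof(data), true);   // write
 *
 * Both @pos and @size must be dword aligned, as enforced by the BUG_ON() in
 * amdgpu_device_mm_access().
 */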
407
408 /*
409  * register access helper functions.
410  */
411
412 /* Check if hw access should be skipped because of hotplug or device error */
413 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
414 {
415         if (adev->no_hw_access)
416                 return true;
417
418 #ifdef CONFIG_LOCKDEP
419         /*
420          * This is a bit complicated to understand, so worth a comment. What we assert
421          * here is that the GPU reset is not running on another thread in parallel.
422          *
423          * For this we trylock the read side of the reset semaphore, if that succeeds
424          * we know that the reset is not running in parallel.
425          *
426          * If the trylock fails we assert that we are either already holding the read
427          * side of the lock or are the reset thread itself and hold the write side of
428          * the lock.
429          */
430         if (in_task()) {
431                 if (down_read_trylock(&adev->reset_domain->sem))
432                         up_read(&adev->reset_domain->sem);
433                 else
434                         lockdep_assert_held(&adev->reset_domain->sem);
435         }
436 #endif
437         return false;
438 }
439
440 /**
441  * amdgpu_device_rreg - read a memory mapped IO or indirect register
442  *
443  * @adev: amdgpu_device pointer
444  * @reg: dword aligned register offset
445  * @acc_flags: access flags which require special behavior
446  *
447  * Returns the 32 bit value from the offset specified.
448  */
449 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
450                             uint32_t reg, uint32_t acc_flags)
451 {
452         uint32_t ret;
453
454         if (amdgpu_device_skip_hw_access(adev))
455                 return 0;
456
457         if ((reg * 4) < adev->rmmio_size) {
458                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
459                     amdgpu_sriov_runtime(adev) &&
460                     down_read_trylock(&adev->reset_domain->sem)) {
461                         ret = amdgpu_kiq_rreg(adev, reg);
462                         up_read(&adev->reset_domain->sem);
463                 } else {
464                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
465                 }
466         } else {
467                 ret = adev->pcie_rreg(adev, reg * 4);
468         }
469
470         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
471
472         return ret;
473 }
474
475 /*
476  * MMIO register read helper function for byte access
477  * @offset: byte offset from MMIO start
478  *
479  */
480
481 /**
482  * amdgpu_mm_rreg8 - read a memory mapped IO register
483  *
484  * @adev: amdgpu_device pointer
485  * @offset: byte aligned register offset
486  *
487  * Returns the 8 bit value from the offset specified.
488  */
489 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
490 {
491         if (amdgpu_device_skip_hw_access(adev))
492                 return 0;
493
494         if (offset < adev->rmmio_size)
495                 return (readb(adev->rmmio + offset));
496         BUG();
497 }
498
499 /*
500  * MMIO register write helper function for byte access
501  * @offset: byte offset from MMIO start
502  * @value: the value to be written to the register
503  *
504  */
505 /**
506  * amdgpu_mm_wreg8 - write a memory mapped IO register
507  *
508  * @adev: amdgpu_device pointer
509  * @offset: byte aligned register offset
510  * @value: 8 bit value to write
511  *
512  * Writes the value specified to the offset specified.
513  */
514 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
515 {
516         if (amdgpu_device_skip_hw_access(adev))
517                 return;
518
519         if (offset < adev->rmmio_size)
520                 writeb(value, adev->rmmio + offset);
521         else
522                 BUG();
523 }
524
525 /**
526  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
527  *
528  * @adev: amdgpu_device pointer
529  * @reg: dword aligned register offset
530  * @v: 32 bit value to write to the register
531  * @acc_flags: access flags which require special behavior
532  *
533  * Writes the value specified to the offset specified.
534  */
535 void amdgpu_device_wreg(struct amdgpu_device *adev,
536                         uint32_t reg, uint32_t v,
537                         uint32_t acc_flags)
538 {
539         if (amdgpu_device_skip_hw_access(adev))
540                 return;
541
542         if ((reg * 4) < adev->rmmio_size) {
543                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
544                     amdgpu_sriov_runtime(adev) &&
545                     down_read_trylock(&adev->reset_domain->sem)) {
546                         amdgpu_kiq_wreg(adev, reg, v);
547                         up_read(&adev->reset_domain->sem);
548                 } else {
549                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
550                 }
551         } else {
552                 adev->pcie_wreg(adev, reg * 4, v);
553         }
554
555         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
556 }
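
/*
 * Note: most code does not call amdgpu_device_rreg()/amdgpu_device_wreg()
 * directly; register accesses usually go through the RREG32()/WREG32() family
 * of macros from amdgpu.h, which supply the dword offset and acc_flags.  A
 * rough sketch (mmSOME_REG is a placeholder register name):
 *
 *   tmp = RREG32(mmSOME_REG);            // amdgpu_device_rreg(adev, reg, 0)
 *   WREG32(mmSOME_REG, tmp | 0x1);       // amdgpu_device_wreg(adev, reg, v, 0)
 *   tmp = RREG32_NO_KIQ(mmSOME_REG);     // skip the KIQ path under SR-IOV
 */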
557
558 /**
559  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
560  *
561  * @adev: amdgpu_device pointer
562  * @reg: mmio/rlc register
563  * @v: value to write
564  *
565  * This function is invoked only for debugfs register access.
566  */
567 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
568                              uint32_t reg, uint32_t v)
569 {
570         if (amdgpu_device_skip_hw_access(adev))
571                 return;
572
573         if (amdgpu_sriov_fullaccess(adev) &&
574             adev->gfx.rlc.funcs &&
575             adev->gfx.rlc.funcs->is_rlcg_access_range) {
576                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
577                         return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
578         } else if ((reg * 4) >= adev->rmmio_size) {
579                 adev->pcie_wreg(adev, reg * 4, v);
580         } else {
581                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
582         }
583 }
584
585 /**
586  * amdgpu_mm_rdoorbell - read a doorbell dword
587  *
588  * @adev: amdgpu_device pointer
589  * @index: doorbell index
590  *
591  * Returns the value in the doorbell aperture at the
592  * requested doorbell index (CIK).
593  */
594 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
595 {
596         if (amdgpu_device_skip_hw_access(adev))
597                 return 0;
598
599         if (index < adev->doorbell.num_doorbells) {
600                 return readl(adev->doorbell.ptr + index);
601         } else {
602                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
603                 return 0;
604         }
605 }
606
607 /**
608  * amdgpu_mm_wdoorbell - write a doorbell dword
609  *
610  * @adev: amdgpu_device pointer
611  * @index: doorbell index
612  * @v: value to write
613  *
614  * Writes @v to the doorbell aperture at the
615  * requested doorbell index (CIK).
616  */
617 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
618 {
619         if (amdgpu_device_skip_hw_access(adev))
620                 return;
621
622         if (index < adev->doorbell.num_doorbells) {
623                 writel(v, adev->doorbell.ptr + index);
624         } else {
625                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
626         }
627 }
628
629 /**
630  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
631  *
632  * @adev: amdgpu_device pointer
633  * @index: doorbell index
634  *
635  * Returns the value in the doorbell aperture at the
636  * requested doorbell index (VEGA10+).
637  */
638 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
639 {
640         if (amdgpu_device_skip_hw_access(adev))
641                 return 0;
642
643         if (index < adev->doorbell.num_doorbells) {
644                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
645         } else {
646                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
647                 return 0;
648         }
649 }
650
651 /**
652  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
653  *
654  * @adev: amdgpu_device pointer
655  * @index: doorbell index
656  * @v: value to write
657  *
658  * Writes @v to the doorbell aperture at the
659  * requested doorbell index (VEGA10+).
660  */
661 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
662 {
663         if (amdgpu_device_skip_hw_access(adev))
664                 return;
665
666         if (index < adev->doorbell.num_doorbells) {
667                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
668         } else {
669                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
670         }
671 }
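
/*
 * As with the MMIO helpers, the doorbell accessors above are normally reached
 * through the RDOORBELL32()/WDOORBELL32()/RDOORBELL64()/WDOORBELL64() macros
 * from amdgpu.h.  A typical ring "commit" sequence ends with something like
 * (sketch only):
 *
 *   WDOORBELL64(ring->doorbell_index, ring->wptr);
 */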
672
673 /**
674  * amdgpu_device_indirect_rreg - read an indirect register
675  *
676  * @adev: amdgpu_device pointer
677  * @pcie_index: mmio register offset of the index register
678  * @pcie_data: mmio register offset of the data register
679  * @reg_addr: indirect register address to read from
680  *
681  * Returns the value of indirect register @reg_addr
682  */
683 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
684                                 u32 pcie_index, u32 pcie_data,
685                                 u32 reg_addr)
686 {
687         unsigned long flags;
688         u32 r;
689         void __iomem *pcie_index_offset;
690         void __iomem *pcie_data_offset;
691
692         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
693         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
694         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
695
696         writel(reg_addr, pcie_index_offset);
697         readl(pcie_index_offset);
698         r = readl(pcie_data_offset);
699         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
700
701         return r;
702 }
703
704 /**
705  * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
706  *
707  * @adev: amdgpu_device pointer
708  * @pcie_index: mmio register offset of the index register
709  * @pcie_data: mmio register offset of the data register
710  * @reg_addr: indirect register address to read from
711  *
712  * Returns the value of indirect register @reg_addr
713  */
714 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
715                                   u32 pcie_index, u32 pcie_data,
716                                   u32 reg_addr)
717 {
718         unsigned long flags;
719         u64 r;
720         void __iomem *pcie_index_offset;
721         void __iomem *pcie_data_offset;
722
723         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
724         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
725         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
726
727         /* read low 32 bits */
728         writel(reg_addr, pcie_index_offset);
729         readl(pcie_index_offset);
730         r = readl(pcie_data_offset);
731         /* read high 32 bits */
732         writel(reg_addr + 4, pcie_index_offset);
733         readl(pcie_index_offset);
734         r |= ((u64)readl(pcie_data_offset) << 32);
735         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
736
737         return r;
738 }
739
740 /**
741  * amdgpu_device_indirect_wreg - write an indirect register
742  *
743  * @adev: amdgpu_device pointer
744  * @pcie_index: mmio register offset of the index register
745  * @pcie_data: mmio register offset of the data register
746  * @reg_addr: indirect register offset
747  * @reg_data: indirect register data
748  *
749  */
750 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
751                                  u32 pcie_index, u32 pcie_data,
752                                  u32 reg_addr, u32 reg_data)
753 {
754         unsigned long flags;
755         void __iomem *pcie_index_offset;
756         void __iomem *pcie_data_offset;
757
758         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
759         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
760         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
761
762         writel(reg_addr, pcie_index_offset);
763         readl(pcie_index_offset);
764         writel(reg_data, pcie_data_offset);
765         readl(pcie_data_offset);
766         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
767 }
768
769 /**
770  * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
771  *
772  * @adev: amdgpu_device pointer
773  * @pcie_index: mmio register offset of the index register
774  * @pcie_data: mmio register offset of the data register
775  * @reg_addr: indirect register offset
776  * @reg_data: indirect register data
777  *
778  */
779 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
780                                    u32 pcie_index, u32 pcie_data,
781                                    u32 reg_addr, u64 reg_data)
782 {
783         unsigned long flags;
784         void __iomem *pcie_index_offset;
785         void __iomem *pcie_data_offset;
786
787         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
788         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
789         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
790
791         /* write low 32 bits */
792         writel(reg_addr, pcie_index_offset);
793         readl(pcie_index_offset);
794         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
795         readl(pcie_data_offset);
796         /* write high 32 bits */
797         writel(reg_addr + 4, pcie_index_offset);
798         readl(pcie_index_offset);
799         writel((u32)(reg_data >> 32), pcie_data_offset);
800         readl(pcie_data_offset);
801         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
802 }
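
/*
 * Illustration of how the indirect accessors are typically consumed: ASIC
 * code wires adev->pcie_rreg/pcie_wreg to thin wrappers that pass the
 * index/data register offsets for that ASIC.  A hypothetical wrapper (the
 * function name is made up; the nbio callbacks mirror how existing SoC files
 * obtain these offsets) might look like:
 *
 *   static u32 soc_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           u32 index = adev->nbio.funcs->get_pcie_index_offset(adev);
 *           u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *           return amdgpu_device_indirect_rreg(adev, index, data, reg);
 *   }
 */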
803
804 /**
805  * amdgpu_invalid_rreg - dummy reg read function
806  *
807  * @adev: amdgpu_device pointer
808  * @reg: offset of register
809  *
810  * Dummy register read function.  Used for register blocks
811  * that certain asics don't have (all asics).
812  * Returns the value in the register.
813  */
814 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
815 {
816         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
817         BUG();
818         return 0;
819 }
820
821 /**
822  * amdgpu_invalid_wreg - dummy reg write function
823  *
824  * @adev: amdgpu_device pointer
825  * @reg: offset of register
826  * @v: value to write to the register
827  *
828  * Dummy register write function.  Used for register blocks
829  * that certain asics don't have (all asics).
830  */
831 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
832 {
833         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
834                   reg, v);
835         BUG();
836 }
837
838 /**
839  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
840  *
841  * @adev: amdgpu_device pointer
842  * @reg: offset of register
843  *
844  * Dummy register read function.  Used for register blocks
845  * that certain asics don't have (all asics).
846  * Returns the value in the register.
847  */
848 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
849 {
850         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
851         BUG();
852         return 0;
853 }
854
855 /**
856  * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
857  *
858  * @adev: amdgpu_device pointer
859  * @reg: offset of register
860  * @v: value to write to the register
861  *
862  * Dummy register write function.  Used for register blocks
863  * that certain asics don't have (all asics).
864  */
865 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
866 {
867         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
868                   reg, v);
869         BUG();
870 }
871
872 /**
873  * amdgpu_block_invalid_rreg - dummy reg read function
874  *
875  * @adev: amdgpu_device pointer
876  * @block: offset of instance
877  * @reg: offset of register
878  *
879  * Dummy register read function.  Used for register blocks
880  * that certain asics don't have (all asics).
881  * Returns the value in the register.
882  */
883 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
884                                           uint32_t block, uint32_t reg)
885 {
886         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
887                   reg, block);
888         BUG();
889         return 0;
890 }
891
892 /**
893  * amdgpu_block_invalid_wreg - dummy reg write function
894  *
895  * @adev: amdgpu_device pointer
896  * @block: offset of instance
897  * @reg: offset of register
898  * @v: value to write to the register
899  *
900  * Dummy register write function.  Used for register blocks
901  * that certain asics don't have (all asics).
902  */
903 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
904                                       uint32_t block,
905                                       uint32_t reg, uint32_t v)
906 {
907         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
908                   reg, block, v);
909         BUG();
910 }
911
912 /**
913  * amdgpu_device_asic_init - Wrapper for atom asic_init
914  *
915  * @adev: amdgpu_device pointer
916  *
917  * Does any asic specific work and then calls atom asic init.
918  */
919 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
920 {
921         amdgpu_asic_pre_asic_init(adev);
922
923         if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
924                 return amdgpu_atomfirmware_asic_init(adev, true);
925         else
926                 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
927 }
928
929 /**
930  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
931  *
932  * @adev: amdgpu_device pointer
933  *
934  * Allocates a scratch page of VRAM for use by various things in the
935  * driver.
936  */
937 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
938 {
939         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
940                                        AMDGPU_GEM_DOMAIN_VRAM |
941                                        AMDGPU_GEM_DOMAIN_GTT,
942                                        &adev->mem_scratch.robj,
943                                        &adev->mem_scratch.gpu_addr,
944                                        (void **)&adev->mem_scratch.ptr);
945 }
946
947 /**
948  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
949  *
950  * @adev: amdgpu_device pointer
951  *
952  * Frees the VRAM scratch page.
953  */
954 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
955 {
956         amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
957 }
958
959 /**
960  * amdgpu_device_program_register_sequence - program an array of registers.
961  *
962  * @adev: amdgpu_device pointer
963  * @registers: pointer to the register array
964  * @array_size: size of the register array
965  *
966  * Programs an array of registers with AND and OR masks.
967  * This is a helper for setting golden registers.
968  */
969 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
970                                              const u32 *registers,
971                                              const u32 array_size)
972 {
973         u32 tmp, reg, and_mask, or_mask;
974         int i;
975
976         if (array_size % 3)
977                 return;
978
979         for (i = 0; i < array_size; i += 3) {
980                 reg = registers[i + 0];
981                 and_mask = registers[i + 1];
982                 or_mask = registers[i + 2];
983
984                 if (and_mask == 0xffffffff) {
985                         tmp = or_mask;
986                 } else {
987                         tmp = RREG32(reg);
988                         tmp &= ~and_mask;
989                         if (adev->family >= AMDGPU_FAMILY_AI)
990                                 tmp |= (or_mask & and_mask);
991                         else
992                                 tmp |= or_mask;
993                 }
994                 WREG32(reg, tmp);
995         }
996 }
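
/*
 * The @registers array is a flat list of {offset, and_mask, or_mask} triples,
 * so @array_size must be a multiple of 3.  A hedged example of a golden
 * register table in that layout (register names and values are placeholders,
 * not real golden settings):
 *
 *   static const u32 example_golden_settings[] = {
 *           mmSOME_REG,  0xffffffff, 0x00000001,   // full overwrite
 *           mmOTHER_REG, 0x0000ff00, 0x00001200,   // read-modify-write
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */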
997
998 /**
999  * amdgpu_device_pci_config_reset - reset the GPU
1000  *
1001  * @adev: amdgpu_device pointer
1002  *
1003  * Resets the GPU using the pci config reset sequence.
1004  * Only applicable to asics prior to vega10.
1005  */
1006 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1007 {
1008         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1009 }
1010
1011 /**
1012  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1013  *
1014  * @adev: amdgpu_device pointer
1015  *
1016  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1017  */
1018 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1019 {
1020         return pci_reset_function(adev->pdev);
1021 }
1022
1023 /*
1024  * GPU doorbell aperture helpers function.
1025  */
1026 /**
1027  * amdgpu_device_doorbell_init - Init doorbell driver information.
1028  *
1029  * @adev: amdgpu_device pointer
1030  *
1031  * Init doorbell driver information (CIK)
1032  * Returns 0 on success, error on failure.
1033  */
1034 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1035 {
1036
1037         /* No doorbell on SI hardware generation */
1038         if (adev->asic_type < CHIP_BONAIRE) {
1039                 adev->doorbell.base = 0;
1040                 adev->doorbell.size = 0;
1041                 adev->doorbell.num_doorbells = 0;
1042                 adev->doorbell.ptr = NULL;
1043                 return 0;
1044         }
1045
1046         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1047                 return -EINVAL;
1048
1049         amdgpu_asic_init_doorbell_index(adev);
1050
1051         /* doorbell bar mapping */
1052         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1053         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1054
1055         if (adev->enable_mes) {
1056                 adev->doorbell.num_doorbells =
1057                         adev->doorbell.size / sizeof(u32);
1058         } else {
1059                 adev->doorbell.num_doorbells =
1060                         min_t(u32, adev->doorbell.size / sizeof(u32),
1061                               adev->doorbell_index.max_assignment+1);
1062                 if (adev->doorbell.num_doorbells == 0)
1063                         return -EINVAL;
1064
1065                 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
1066                  * paging queue doorbells use the second page. The
1067                  * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1068                  * doorbells are in the first page. So with paging queue enabled,
1069                  * the max num_doorbells is extended by one page (0x400 dwords).
1070                  */
1071                 if (adev->asic_type >= CHIP_VEGA10)
1072                         adev->doorbell.num_doorbells += 0x400;
1073         }
1074
1075         adev->doorbell.ptr = ioremap(adev->doorbell.base,
1076                                      adev->doorbell.num_doorbells *
1077                                      sizeof(u32));
1078         if (adev->doorbell.ptr == NULL)
1079                 return -ENOMEM;
1080
1081         return 0;
1082 }
1083
1084 /**
1085  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1086  *
1087  * @adev: amdgpu_device pointer
1088  *
1089  * Tear down doorbell driver information (CIK)
1090  */
1091 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1092 {
1093         iounmap(adev->doorbell.ptr);
1094         adev->doorbell.ptr = NULL;
1095 }
1096
1097
1098
1099 /*
1100  * amdgpu_device_wb_*()
1101  * Writeback is the method by which the GPU updates special pages in memory
1102  * with the status of certain GPU events (fences, ring pointers, etc.).
1103  */
1104
1105 /**
1106  * amdgpu_device_wb_fini - Disable Writeback and free memory
1107  *
1108  * @adev: amdgpu_device pointer
1109  *
1110  * Disables Writeback and frees the Writeback memory (all asics).
1111  * Used at driver shutdown.
1112  */
1113 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1114 {
1115         if (adev->wb.wb_obj) {
1116                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1117                                       &adev->wb.gpu_addr,
1118                                       (void **)&adev->wb.wb);
1119                 adev->wb.wb_obj = NULL;
1120         }
1121 }
1122
1123 /**
1124  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1125  *
1126  * @adev: amdgpu_device pointer
1127  *
1128  * Initializes writeback and allocates writeback memory (all asics).
1129  * Used at driver startup.
1130  * Returns 0 on success or a negative error code on failure.
1131  */
1132 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1133 {
1134         int r;
1135
1136         if (adev->wb.wb_obj == NULL) {
1137                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1138                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1139                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1140                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1141                                             (void **)&adev->wb.wb);
1142                 if (r) {
1143                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1144                         return r;
1145                 }
1146
1147                 adev->wb.num_wb = AMDGPU_MAX_WB;
1148                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1149
1150                 /* clear wb memory */
1151                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1152         }
1153
1154         return 0;
1155 }
1156
1157 /**
1158  * amdgpu_device_wb_get - Allocate a wb entry
1159  *
1160  * @adev: amdgpu_device pointer
1161  * @wb: wb index
1162  *
1163  * Allocate a wb slot for use by the driver (all asics).
1164  * Returns 0 on success or -EINVAL on failure.
1165  */
1166 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1167 {
1168         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1169
1170         if (offset < adev->wb.num_wb) {
1171                 __set_bit(offset, adev->wb.used);
1172                 *wb = offset << 3; /* convert to dw offset */
1173                 return 0;
1174         } else {
1175                 return -EINVAL;
1176         }
1177 }
1178
1179 /**
1180  * amdgpu_device_wb_free - Free a wb entry
1181  *
1182  * @adev: amdgpu_device pointer
1183  * @wb: wb index
1184  *
1185  * Free a wb slot allocated for use by the driver (all asics)
1186  */
1187 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1188 {
1189         wb >>= 3;
1190         if (wb < adev->wb.num_wb)
1191                 __clear_bit(wb, adev->wb.used);
1192 }
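
/*
 * Typical writeback usage (illustrative): a ring or IP block reserves a slot,
 * derives the CPU and GPU addresses from the returned dword offset, and frees
 * the slot when it is no longer needed:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           // hand gpu_addr to the hardware, poll *cpu_addr for updates
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */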
1193
1194 /**
1195  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1196  *
1197  * @adev: amdgpu_device pointer
1198  *
1199  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1200  * to fail, but if any of the BARs is not accessible after the resize we abort
1201  * driver loading by returning -ENODEV.
1202  */
1203 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1204 {
1205         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1206         struct pci_bus *root;
1207         struct resource *res;
1208         unsigned i;
1209         u16 cmd;
1210         int r;
1211
1212         /* Bypass for VF */
1213         if (amdgpu_sriov_vf(adev))
1214                 return 0;
1215
1216         /* skip if the bios has already enabled large BAR */
1217         if (adev->gmc.real_vram_size &&
1218             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1219                 return 0;
1220
1221         /* Check if the root BUS has 64bit memory resources */
1222         root = adev->pdev->bus;
1223         while (root->parent)
1224                 root = root->parent;
1225
1226         pci_bus_for_each_resource(root, res, i) {
1227                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1228                     res->start > 0x100000000ull)
1229                         break;
1230         }
1231
1232         /* Trying to resize is pointless without a root hub window above 4GB */
1233         if (!res)
1234                 return 0;
1235
1236         /* Limit the BAR size to what is available */
1237         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1238                         rbar_size);
1239
1240         /* Disable memory decoding while we change the BAR addresses and size */
1241         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1242         pci_write_config_word(adev->pdev, PCI_COMMAND,
1243                               cmd & ~PCI_COMMAND_MEMORY);
1244
1245         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1246         amdgpu_device_doorbell_fini(adev);
1247         if (adev->asic_type >= CHIP_BONAIRE)
1248                 pci_release_resource(adev->pdev, 2);
1249
1250         pci_release_resource(adev->pdev, 0);
1251
1252         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1253         if (r == -ENOSPC)
1254                 DRM_INFO("Not enough PCI address space for a large BAR.");
1255         else if (r && r != -ENOTSUPP)
1256                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1257
1258         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1259
1260         /* When the doorbell or fb BAR isn't available we have no chance of
1261          * using the device.
1262          */
1263         r = amdgpu_device_doorbell_init(adev);
1264         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1265                 return -ENODEV;
1266
1267         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1268
1269         return 0;
1270 }
1271
1272 /*
1273  * GPU helpers function.
1274  */
1275 /**
1276  * amdgpu_device_need_post - check if the hw need post or not
1277  *
1278  * @adev: amdgpu_device pointer
1279  *
1280  * Check if the asic has been initialized (all asics) at driver startup,
1281  * or if post is needed because a hw reset was performed.
1282  * Returns true if post is needed, false if not.
1283  */
1284 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1285 {
1286         uint32_t reg;
1287
1288         if (amdgpu_sriov_vf(adev))
1289                 return false;
1290
1291         if (amdgpu_passthrough(adev)) {
1292                 /* for FIJI: In the whole-GPU pass-through virtualization case, after VM reboot
1293                  * some old SMC firmware still needs the driver to do vPost, otherwise the GPU
1294                  * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force
1295                  * vPost for SMC versions below 22.15.
1296                  */
1297                 if (adev->asic_type == CHIP_FIJI) {
1298                         int err;
1299                         uint32_t fw_ver;
1300                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1301                         /* force vPost if an error occurred */
1302                         if (err)
1303                                 return true;
1304
1305                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1306                         if (fw_ver < 0x00160e00)
1307                                 return true;
1308                 }
1309         }
1310
1311         /* Don't post if we need to reset whole hive on init */
1312         if (adev->gmc.xgmi.pending_reset)
1313                 return false;
1314
1315         if (adev->has_hw_reset) {
1316                 adev->has_hw_reset = false;
1317                 return true;
1318         }
1319
1320         /* bios scratch used on CIK+ */
1321         if (adev->asic_type >= CHIP_BONAIRE)
1322                 return amdgpu_atombios_scratch_need_asic_init(adev);
1323
1324         /* check MEM_SIZE for older asics */
1325         reg = amdgpu_asic_get_config_memsize(adev);
1326
1327         if ((reg != 0) && (reg != 0xffffffff))
1328                 return false;
1329
1330         return true;
1331 }
1332
1333 /**
1334  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1335  *
1336  * @adev: amdgpu_device pointer
1337  *
1338  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1339  * be set for this device.
1340  *
1341  * Returns true if it should be used or false if not.
1342  */
1343 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1344 {
1345         switch (amdgpu_aspm) {
1346         case -1:
1347                 break;
1348         case 0:
1349                 return false;
1350         case 1:
1351                 return true;
1352         default:
1353                 return false;
1354         }
1355         return pcie_aspm_enabled(adev->pdev);
1356 }
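
/*
 * Summary of the amdgpu.aspm module parameter semantics implemented above:
 *   0  -> never program ASPM
 *   1  -> always program ASPM
 *   -1 -> default; follow whatever ASPM state the PCIe bridge already enabled
 */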
1357
1358 /* if we get transitioned to only one device, take VGA back */
1359 /**
1360  * amdgpu_device_vga_set_decode - enable/disable vga decode
1361  *
1362  * @pdev: PCI device pointer
1363  * @state: enable/disable vga decode
1364  *
1365  * Enable/disable vga decode (all asics).
1366  * Returns VGA resource flags.
1367  */
1368 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1369                 bool state)
1370 {
1371         struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1372         amdgpu_asic_set_vga_state(adev, state);
1373         if (state)
1374                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1375                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1376         else
1377                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1378 }
1379
1380 /**
1381  * amdgpu_device_check_block_size - validate the vm block size
1382  *
1383  * @adev: amdgpu_device pointer
1384  *
1385  * Validates the vm block size specified via module parameter.
1386  * The vm block size defines the number of bits in the page table versus the
1387  * page directory. A page is 4KB, so we have a 12 bit offset, a minimum of 9
1388  * bits in the page table and the remaining bits in the page directory.
1389  */
1390 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1391 {
1392         /* defines the number of bits in the page table versus the page directory,
1393          * a page is 4KB so we have a 12 bit offset, a minimum of 9 bits in the
1394          * page table and the remaining bits in the page directory */
1395         if (amdgpu_vm_block_size == -1)
1396                 return;
1397
1398         if (amdgpu_vm_block_size < 9) {
1399                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1400                          amdgpu_vm_block_size);
1401                 amdgpu_vm_block_size = -1;
1402         }
1403 }
1404
1405 /**
1406  * amdgpu_device_check_vm_size - validate the vm size
1407  *
1408  * @adev: amdgpu_device pointer
1409  *
1410  * Validates the vm size in GB specified via module parameter.
1411  * The VM size is the size of the GPU virtual memory space in GB.
1412  */
1413 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1414 {
1415         /* no need to check the default value */
1416         if (amdgpu_vm_size == -1)
1417                 return;
1418
1419         if (amdgpu_vm_size < 1) {
1420                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1421                          amdgpu_vm_size);
1422                 amdgpu_vm_size = -1;
1423         }
1424 }
1425
1426 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1427 {
1428         struct sysinfo si;
1429         bool is_os_64 = (sizeof(void *) == 8);
1430         uint64_t total_memory;
1431         uint64_t dram_size_seven_GB = 0x1B8000000;
1432         uint64_t dram_size_three_GB = 0xB8000000;
1433
1434         if (amdgpu_smu_memory_pool_size == 0)
1435                 return;
1436
1437         if (!is_os_64) {
1438                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1439                 goto def_value;
1440         }
1441         si_meminfo(&si);
1442         total_memory = (uint64_t)si.totalram * si.mem_unit;
1443
1444         if ((amdgpu_smu_memory_pool_size == 1) ||
1445                 (amdgpu_smu_memory_pool_size == 2)) {
1446                 if (total_memory < dram_size_three_GB)
1447                         goto def_value1;
1448         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1449                 (amdgpu_smu_memory_pool_size == 8)) {
1450                 if (total_memory < dram_size_seven_GB)
1451                         goto def_value1;
1452         } else {
1453                 DRM_WARN("Smu memory pool size not supported\n");
1454                 goto def_value;
1455         }
1456         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1457
1458         return;
1459
1460 def_value1:
1461         DRM_WARN("Not enough system memory\n");
1462 def_value:
1463         adev->pm.smu_prv_buffer_size = 0;
1464 }
1465
1466 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1467 {
1468         if (!(adev->flags & AMD_IS_APU) ||
1469             adev->asic_type < CHIP_RAVEN)
1470                 return 0;
1471
1472         switch (adev->asic_type) {
1473         case CHIP_RAVEN:
1474                 if (adev->pdev->device == 0x15dd)
1475                         adev->apu_flags |= AMD_APU_IS_RAVEN;
1476                 if (adev->pdev->device == 0x15d8)
1477                         adev->apu_flags |= AMD_APU_IS_PICASSO;
1478                 break;
1479         case CHIP_RENOIR:
1480                 if ((adev->pdev->device == 0x1636) ||
1481                     (adev->pdev->device == 0x164c))
1482                         adev->apu_flags |= AMD_APU_IS_RENOIR;
1483                 else
1484                         adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1485                 break;
1486         case CHIP_VANGOGH:
1487                 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1488                 break;
1489         case CHIP_YELLOW_CARP:
1490                 break;
1491         case CHIP_CYAN_SKILLFISH:
1492                 if ((adev->pdev->device == 0x13FE) ||
1493                     (adev->pdev->device == 0x143F))
1494                         adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1495                 break;
1496         default:
1497                 break;
1498         }
1499
1500         return 0;
1501 }
1502
1503 /**
1504  * amdgpu_device_check_arguments - validate module params
1505  *
1506  * @adev: amdgpu_device pointer
1507  *
1508  * Validates certain module parameters and updates
1509  * the associated values used by the driver (all asics).
1510  */
1511 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1512 {
1513         if (amdgpu_sched_jobs < 4) {
1514                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1515                          amdgpu_sched_jobs);
1516                 amdgpu_sched_jobs = 4;
1517         } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1518                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1519                          amdgpu_sched_jobs);
1520                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1521         }
1522
1523         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1524                 /* gart size must be greater than or equal to 32M */
1525                 dev_warn(adev->dev, "gart size (%d) too small\n",
1526                          amdgpu_gart_size);
1527                 amdgpu_gart_size = -1;
1528         }
1529
1530         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1531                 /* gtt size must be greater than or equal to 32M */
1532                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1533                                  amdgpu_gtt_size);
1534                 amdgpu_gtt_size = -1;
1535         }
1536
1537         /* valid range is between 4 and 9 inclusive */
1538         if (amdgpu_vm_fragment_size != -1 &&
1539             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1540                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1541                 amdgpu_vm_fragment_size = -1;
1542         }
1543
1544         if (amdgpu_sched_hw_submission < 2) {
1545                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1546                          amdgpu_sched_hw_submission);
1547                 amdgpu_sched_hw_submission = 2;
1548         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1549                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1550                          amdgpu_sched_hw_submission);
1551                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1552         }
1553
1554         if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1555                 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1556                 amdgpu_reset_method = -1;
1557         }
1558
1559         amdgpu_device_check_smu_prv_buffer_size(adev);
1560
1561         amdgpu_device_check_vm_size(adev);
1562
1563         amdgpu_device_check_block_size(adev);
1564
1565         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1566
1567         return 0;
1568 }
1569
1570 /**
1571  * amdgpu_switcheroo_set_state - set switcheroo state
1572  *
1573  * @pdev: pci dev pointer
1574  * @state: vga_switcheroo state
1575  *
1576  * Callback for the switcheroo driver.  Suspends or resumes
1577  * the asics before or after it is powered up using ACPI methods.
1578  */
1579 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1580                                         enum vga_switcheroo_state state)
1581 {
1582         struct drm_device *dev = pci_get_drvdata(pdev);
1583         int r;
1584
1585         if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1586                 return;
1587
1588         if (state == VGA_SWITCHEROO_ON) {
1589                 pr_info("switched on\n");
1590                 /* don't suspend or resume card normally */
1591                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1592
1593                 pci_set_power_state(pdev, PCI_D0);
1594                 amdgpu_device_load_pci_state(pdev);
1595                 r = pci_enable_device(pdev);
1596                 if (r)
1597                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1598                 amdgpu_device_resume(dev, true);
1599
1600                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1601         } else {
1602                 pr_info("switched off\n");
1603                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1604                 amdgpu_device_suspend(dev, true);
1605                 amdgpu_device_cache_pci_state(pdev);
1606                 /* Shut down the device */
1607                 pci_disable_device(pdev);
1608                 pci_set_power_state(pdev, PCI_D3cold);
1609                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1610         }
1611 }
1612
1613 /**
1614  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1615  *
1616  * @pdev: pci dev pointer
1617  *
1618  * Callback for the switcheroo driver.  Checks if the switcheroo
1619  * state can be changed.
1620  * Returns true if the state can be changed, false if not.
1621  */
1622 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1623 {
1624         struct drm_device *dev = pci_get_drvdata(pdev);
1625
1626         /*
1627          * FIXME: open_count is protected by drm_global_mutex but that would lead to
1628          * locking inversion with the driver load path. And the access here is
1629          * completely racy anyway. So don't bother with locking for now.
1630          */
1631         return atomic_read(&dev->open_count) == 0;
1632 }
1633
1634 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1635         .set_gpu_state = amdgpu_switcheroo_set_state,
1636         .reprobe = NULL,
1637         .can_switch = amdgpu_switcheroo_can_switch,
1638 };
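
/*
 * Illustrative sketch (hedged, not taken from the code shown here): these ops
 * are handed to the vga_switcheroo core during device init, elsewhere in this
 * file, roughly as
 *
 *     vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);
 *
 * where px indicates whether the board supports PX power control.
 */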
1639
1640 /**
1641  * amdgpu_device_ip_set_clockgating_state - set the CG state
1642  *
1643  * @dev: amdgpu_device pointer
1644  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1645  * @state: clockgating state (gate or ungate)
1646  *
1647  * Sets the requested clockgating state for all instances of
1648  * the hardware IP specified.
1649  * Returns the error code from the last instance.
1650  */
1651 int amdgpu_device_ip_set_clockgating_state(void *dev,
1652                                            enum amd_ip_block_type block_type,
1653                                            enum amd_clockgating_state state)
1654 {
1655         struct amdgpu_device *adev = dev;
1656         int i, r = 0;
1657
1658         for (i = 0; i < adev->num_ip_blocks; i++) {
1659                 if (!adev->ip_blocks[i].status.valid)
1660                         continue;
1661                 if (adev->ip_blocks[i].version->type != block_type)
1662                         continue;
1663                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1664                         continue;
1665                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1666                         (void *)adev, state);
1667                 if (r)
1668                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1669                                   adev->ip_blocks[i].version->funcs->name, r);
1670         }
1671         return r;
1672 }
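
/*
 * Illustrative usage sketch (assumed caller, not part of the driver logic
 * above): gating GFX clocks on an initialized device would look roughly like
 *
 *     r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                                AMD_CG_STATE_GATE);
 *
 * with adev being the struct amdgpu_device pointer; instances that are invalid
 * or lack a set_clockgating_state callback are skipped.
 */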
1673
1674 /**
1675  * amdgpu_device_ip_set_powergating_state - set the PG state
1676  *
1677  * @dev: amdgpu_device pointer
1678  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1679  * @state: powergating state (gate or ungate)
1680  *
1681  * Sets the requested powergating state for all instances of
1682  * the hardware IP specified.
1683  * Returns the error code from the last instance.
1684  */
1685 int amdgpu_device_ip_set_powergating_state(void *dev,
1686                                            enum amd_ip_block_type block_type,
1687                                            enum amd_powergating_state state)
1688 {
1689         struct amdgpu_device *adev = dev;
1690         int i, r = 0;
1691
1692         for (i = 0; i < adev->num_ip_blocks; i++) {
1693                 if (!adev->ip_blocks[i].status.valid)
1694                         continue;
1695                 if (adev->ip_blocks[i].version->type != block_type)
1696                         continue;
1697                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1698                         continue;
1699                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1700                         (void *)adev, state);
1701                 if (r)
1702                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1703                                   adev->ip_blocks[i].version->funcs->name, r);
1704         }
1705         return r;
1706 }
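
/*
 * Illustrative usage sketch (assumed caller): gating power for the VCN block
 * would look roughly like
 *
 *     r = amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                                AMD_PG_STATE_GATE);
 *
 * Only the return code of the last matching instance is propagated, as noted
 * in the kernel-doc above.
 */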
1707
1708 /**
1709  * amdgpu_device_ip_get_clockgating_state - get the CG state
1710  *
1711  * @adev: amdgpu_device pointer
1712  * @flags: clockgating feature flags
1713  *
1714  * Walks the list of IPs on the device and updates the clockgating
1715  * flags for each IP.
1716  * Updates @flags with the feature flags for each hardware IP where
1717  * clockgating is enabled.
1718  */
1719 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1720                                             u64 *flags)
1721 {
1722         int i;
1723
1724         for (i = 0; i < adev->num_ip_blocks; i++) {
1725                 if (!adev->ip_blocks[i].status.valid)
1726                         continue;
1727                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1728                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1729         }
1730 }
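
/*
 * Illustrative usage sketch (assumed caller): the flags are accumulated into a
 * caller-provided mask, so a typical reader would do roughly
 *
 *     u64 flags = 0;
 *
 *     amdgpu_device_ip_get_clockgating_state(adev, &flags);
 *
 * and then decode the individual clockgating support bits set in flags (the
 * bit definitions live outside this file).
 */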
1731
1732 /**
1733  * amdgpu_device_ip_wait_for_idle - wait for idle
1734  *
1735  * @adev: amdgpu_device pointer
1736  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1737  *
1738  * Waits for the requested hardware IP to be idle.
1739  * Returns 0 for success or a negative error code on failure.
1740  */
1741 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1742                                    enum amd_ip_block_type block_type)
1743 {
1744         int i, r;
1745
1746         for (i = 0; i < adev->num_ip_blocks; i++) {
1747                 if (!adev->ip_blocks[i].status.valid)
1748                         continue;
1749                 if (adev->ip_blocks[i].version->type == block_type) {
1750                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1751                         if (r)
1752                                 return r;
1753                         break;
1754                 }
1755         }
1756         return 0;
1757
1758 }
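
/*
 * Illustrative usage sketch (assumed caller): waiting for the GFX block to
 * drain before touching its state would look roughly like
 *
 *     r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
 *     if (r)
 *             return r;
 *
 * Only the first valid instance of the requested block type is waited on.
 */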
1759
1760 /**
1761  * amdgpu_device_ip_is_idle - is the hardware IP idle
1762  *
1763  * @adev: amdgpu_device pointer
1764  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1765  *
1766  * Check if the hardware IP is idle or not.
1767  * Returns true if the IP is idle, false if not.
1768  */
1769 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1770                               enum amd_ip_block_type block_type)
1771 {
1772         int i;
1773
1774         for (i = 0; i < adev->num_ip_blocks; i++) {
1775                 if (!adev->ip_blocks[i].status.valid)
1776                         continue;
1777                 if (adev->ip_blocks[i].version->type == block_type)
1778                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1779         }
1780         return true;
1781
1782 }
1783
1784 /**
1785  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1786  *
1787  * @adev: amdgpu_device pointer
1788  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1789  *
1790  * Returns a pointer to the hardware IP block structure
1791  * if it exists for the asic, otherwise NULL.
1792  */
1793 struct amdgpu_ip_block *
1794 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1795                               enum amd_ip_block_type type)
1796 {
1797         int i;
1798
1799         for (i = 0; i < adev->num_ip_blocks; i++)
1800                 if (adev->ip_blocks[i].version->type == type)
1801                         return &adev->ip_blocks[i];
1802
1803         return NULL;
1804 }
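
/*
 * Illustrative usage sketch (assumed caller): code that needs the version of a
 * block typically does roughly
 *
 *     struct amdgpu_ip_block *ip_block;
 *
 *     ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *     if (!ip_block)
 *             return -EINVAL;
 *
 * and then reads ip_block->version->major and ->minor.
 */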
1805
1806 /**
1807  * amdgpu_device_ip_block_version_cmp
1808  *
1809  * @adev: amdgpu_device pointer
1810  * @type: enum amd_ip_block_type
1811  * @major: major version
1812  * @minor: minor version
1813  *
1814  * return 0 if equal or greater
1815  * return 1 if smaller or the ip_block doesn't exist
1816  */
1817 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1818                                        enum amd_ip_block_type type,
1819                                        u32 major, u32 minor)
1820 {
1821         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1822
1823         if (ip_block && ((ip_block->version->major > major) ||
1824                         ((ip_block->version->major == major) &&
1825                         (ip_block->version->minor >= minor))))
1826                 return 0;
1827
1828         return 1;
1829 }
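
/*
 * Illustrative usage sketch (version numbers are only an example): checking
 * for at least GMC 8.1 on the asic would be roughly
 *
 *     if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GMC, 8, 1))
 *             ...GMC block present at version 8.1 or newer...
 *
 * Note the inverted return convention: 0 means "equal or greater", 1 means
 * older or missing.
 */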
1830
1831 /**
1832  * amdgpu_device_ip_block_add
1833  *
1834  * @adev: amdgpu_device pointer
1835  * @ip_block_version: pointer to the IP to add
1836  *
1837  * Adds the IP block driver information to the collection of IPs
1838  * on the asic.
1839  */
1840 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1841                                const struct amdgpu_ip_block_version *ip_block_version)
1842 {
1843         if (!ip_block_version)
1844                 return -EINVAL;
1845
1846         switch (ip_block_version->type) {
1847         case AMD_IP_BLOCK_TYPE_VCN:
1848                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1849                         return 0;
1850                 break;
1851         case AMD_IP_BLOCK_TYPE_JPEG:
1852                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1853                         return 0;
1854                 break;
1855         default:
1856                 break;
1857         }
1858
1859         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1860                   ip_block_version->funcs->name);
1861
1862         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1863
1864         return 0;
1865 }
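
/*
 * Illustrative usage sketch (the block version name is an assumption, defined
 * in the per-asic code): asic-specific set_ip_blocks() helpers register their
 * IPs in hw init order, roughly as
 *
 *     r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *     if (r)
 *             return r;
 *
 * Harvested VCN/JPEG blocks are silently skipped by the helper above.
 */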
1866
1867 /**
1868  * amdgpu_device_enable_virtual_display - enable virtual display feature
1869  *
1870  * @adev: amdgpu_device pointer
1871  *
1872  * Enables the virtual display feature if the user has enabled it via
1873  * the module parameter virtual_display.  This feature provides virtual
1874  * display hardware on headless boards or in virtualized environments.
1875  * This function parses and validates the configuration string specified by
1876  * the user and configures the virtual display configuration (number of
1877  * virtual connectors, crtcs, etc.) specified.
1878  */
1879 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1880 {
1881         adev->enable_virtual_display = false;
1882
1883         if (amdgpu_virtual_display) {
1884                 const char *pci_address_name = pci_name(adev->pdev);
1885                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1886
1887                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1888                 pciaddstr_tmp = pciaddstr;
1889                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1890                         pciaddname = strsep(&pciaddname_tmp, ",");
1891                         if (!strcmp("all", pciaddname)
1892                             || !strcmp(pci_address_name, pciaddname)) {
1893                                 long num_crtc;
1894                                 int res = -1;
1895
1896                                 adev->enable_virtual_display = true;
1897
1898                                 if (pciaddname_tmp)
1899                                         res = kstrtol(pciaddname_tmp, 10,
1900                                                       &num_crtc);
1901
1902                                 if (!res) {
1903                                         if (num_crtc < 1)
1904                                                 num_crtc = 1;
1905                                         if (num_crtc > 6)
1906                                                 num_crtc = 6;
1907                                         adev->mode_info.num_crtc = num_crtc;
1908                                 } else {
1909                                         adev->mode_info.num_crtc = 1;
1910                                 }
1911                                 break;
1912                         }
1913                 }
1914
1915                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1916                          amdgpu_virtual_display, pci_address_name,
1917                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1918
1919                 kfree(pciaddstr);
1920         }
1921 }
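
/*
 * Illustrative sketch of the accepted parameter format (derived from the
 * parsing above, not an exhaustive specification): entries are separated by
 * ';' and each entry is "<pci address>[,<num_crtc>]" or "all", e.g.
 *
 *     amdgpu.virtual_display=0000:01:00.0,2
 *
 * would enable two virtual crtcs on the matching device; num_crtc is clamped
 * to the 1..6 range and defaults to 1.
 */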
1922
1923 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1924 {
1925         if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1926                 adev->mode_info.num_crtc = 1;
1927                 adev->enable_virtual_display = true;
1928                 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1929                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1930         }
1931 }
1932
1933 /**
1934  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1935  *
1936  * @adev: amdgpu_device pointer
1937  *
1938  * Parses the asic configuration parameters specified in the gpu info
1939  * firmware and makes them available to the driver for use in configuring
1940  * the asic.
1941  * Returns 0 on success, -EINVAL on failure.
1942  */
1943 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1944 {
1945         const char *chip_name;
1946         char fw_name[40];
1947         int err;
1948         const struct gpu_info_firmware_header_v1_0 *hdr;
1949
1950         adev->firmware.gpu_info_fw = NULL;
1951
1952         if (adev->mman.discovery_bin) {
1953                 /*
1954                  * FIXME: The bounding box is still needed by Navi12, so
1955                  * temporarily read it from gpu_info firmware. Should be dropped
1956                  * when DAL no longer needs it.
1957                  */
1958                 if (adev->asic_type != CHIP_NAVI12)
1959                         return 0;
1960         }
1961
1962         switch (adev->asic_type) {
1963         default:
1964                 return 0;
1965         case CHIP_VEGA10:
1966                 chip_name = "vega10";
1967                 break;
1968         case CHIP_VEGA12:
1969                 chip_name = "vega12";
1970                 break;
1971         case CHIP_RAVEN:
1972                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1973                         chip_name = "raven2";
1974                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1975                         chip_name = "picasso";
1976                 else
1977                         chip_name = "raven";
1978                 break;
1979         case CHIP_ARCTURUS:
1980                 chip_name = "arcturus";
1981                 break;
1982         case CHIP_NAVI12:
1983                 chip_name = "navi12";
1984                 break;
1985         }
1986
1987         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1988         err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1989         if (err) {
1990                 dev_err(adev->dev,
1991                         "Failed to get gpu_info firmware \"%s\"\n",
1992                         fw_name);
1993                 goto out;
1994         }
1995
1996         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1997         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1998
1999         switch (hdr->version_major) {
2000         case 1:
2001         {
2002                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2003                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2004                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2005
2006                 /*
2007                  * Should be dropped when DAL no longer needs it.
2008                  */
2009                 if (adev->asic_type == CHIP_NAVI12)
2010                         goto parse_soc_bounding_box;
2011
2012                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2013                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2014                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2015                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2016                 adev->gfx.config.max_texture_channel_caches =
2017                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
2018                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2019                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2020                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2021                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2022                 adev->gfx.config.double_offchip_lds_buf =
2023                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2024                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2025                 adev->gfx.cu_info.max_waves_per_simd =
2026                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2027                 adev->gfx.cu_info.max_scratch_slots_per_cu =
2028                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2029                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2030                 if (hdr->version_minor >= 1) {
2031                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2032                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2033                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2034                         adev->gfx.config.num_sc_per_sh =
2035                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2036                         adev->gfx.config.num_packer_per_sc =
2037                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2038                 }
2039
2040 parse_soc_bounding_box:
2041                 /*
2042                  * soc bounding box info is not integrated in the discovery table,
2043                  * so it always needs to be parsed from the gpu info firmware when needed.
2044                  */
2045                 if (hdr->version_minor == 2) {
2046                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2047                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2048                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2049                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2050                 }
2051                 break;
2052         }
2053         default:
2054                 dev_err(adev->dev,
2055                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2056                 err = -EINVAL;
2057                 goto out;
2058         }
2059 out:
2060         return err;
2061 }
2062
2063 /**
2064  * amdgpu_device_ip_early_init - run early init for hardware IPs
2065  *
2066  * @adev: amdgpu_device pointer
2067  *
2068  * Early initialization pass for hardware IPs.  The hardware IPs that make
2069  * up each asic are discovered and each IP's early_init callback is run.  This
2070  * is the first stage in initializing the asic.
2071  * Returns 0 on success, negative error code on failure.
2072  */
2073 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2074 {
2075         struct drm_device *dev = adev_to_drm(adev);
2076         struct pci_dev *parent;
2077         int i, r;
2078         bool total;
2079
2080         amdgpu_device_enable_virtual_display(adev);
2081
2082         if (amdgpu_sriov_vf(adev)) {
2083                 r = amdgpu_virt_request_full_gpu(adev, true);
2084                 if (r)
2085                         return r;
2086         }
2087
2088         switch (adev->asic_type) {
2089 #ifdef CONFIG_DRM_AMDGPU_SI
2090         case CHIP_VERDE:
2091         case CHIP_TAHITI:
2092         case CHIP_PITCAIRN:
2093         case CHIP_OLAND:
2094         case CHIP_HAINAN:
2095                 adev->family = AMDGPU_FAMILY_SI;
2096                 r = si_set_ip_blocks(adev);
2097                 if (r)
2098                         return r;
2099                 break;
2100 #endif
2101 #ifdef CONFIG_DRM_AMDGPU_CIK
2102         case CHIP_BONAIRE:
2103         case CHIP_HAWAII:
2104         case CHIP_KAVERI:
2105         case CHIP_KABINI:
2106         case CHIP_MULLINS:
2107                 if (adev->flags & AMD_IS_APU)
2108                         adev->family = AMDGPU_FAMILY_KV;
2109                 else
2110                         adev->family = AMDGPU_FAMILY_CI;
2111
2112                 r = cik_set_ip_blocks(adev);
2113                 if (r)
2114                         return r;
2115                 break;
2116 #endif
2117         case CHIP_TOPAZ:
2118         case CHIP_TONGA:
2119         case CHIP_FIJI:
2120         case CHIP_POLARIS10:
2121         case CHIP_POLARIS11:
2122         case CHIP_POLARIS12:
2123         case CHIP_VEGAM:
2124         case CHIP_CARRIZO:
2125         case CHIP_STONEY:
2126                 if (adev->flags & AMD_IS_APU)
2127                         adev->family = AMDGPU_FAMILY_CZ;
2128                 else
2129                         adev->family = AMDGPU_FAMILY_VI;
2130
2131                 r = vi_set_ip_blocks(adev);
2132                 if (r)
2133                         return r;
2134                 break;
2135         default:
2136                 r = amdgpu_discovery_set_ip_blocks(adev);
2137                 if (r)
2138                         return r;
2139                 break;
2140         }
2141
2142         if (amdgpu_has_atpx() &&
2143             (amdgpu_is_atpx_hybrid() ||
2144              amdgpu_has_atpx_dgpu_power_cntl()) &&
2145             ((adev->flags & AMD_IS_APU) == 0) &&
2146             !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2147                 adev->flags |= AMD_IS_PX;
2148
2149         if (!(adev->flags & AMD_IS_APU)) {
2150                 parent = pci_upstream_bridge(adev->pdev);
2151                 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2152         }
2153
2154         amdgpu_amdkfd_device_probe(adev);
2155
2156         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2157         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2158                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2159         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2160                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2161
2162         total = true;
2163         for (i = 0; i < adev->num_ip_blocks; i++) {
2164                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2165                         DRM_ERROR("disabled ip block: %d <%s>\n",
2166                                   i, adev->ip_blocks[i].version->funcs->name);
2167                         adev->ip_blocks[i].status.valid = false;
2168                 } else {
2169                         if (adev->ip_blocks[i].version->funcs->early_init) {
2170                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2171                                 if (r == -ENOENT) {
2172                                         adev->ip_blocks[i].status.valid = false;
2173                                 } else if (r) {
2174                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2175                                                   adev->ip_blocks[i].version->funcs->name, r);
2176                                         total = false;
2177                                 } else {
2178                                         adev->ip_blocks[i].status.valid = true;
2179                                 }
2180                         } else {
2181                                 adev->ip_blocks[i].status.valid = true;
2182                         }
2183                 }
2184                 /* get the vbios after the asic_funcs are set up */
2185                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2186                         r = amdgpu_device_parse_gpu_info_fw(adev);
2187                         if (r)
2188                                 return r;
2189
2190                         /* Read BIOS */
2191                         if (!amdgpu_get_bios(adev))
2192                                 return -EINVAL;
2193
2194                         r = amdgpu_atombios_init(adev);
2195                         if (r) {
2196                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2197                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2198                                 return r;
2199                         }
2200
2201                         /* get pf2vf msg info at its earliest time */
2202                         if (amdgpu_sriov_vf(adev))
2203                                 amdgpu_virt_init_data_exchange(adev);
2204
2205                 }
2206         }
2207         if (!total)
2208                 return -ENODEV;
2209
2210         adev->cg_flags &= amdgpu_cg_mask;
2211         adev->pg_flags &= amdgpu_pg_mask;
2212
2213         return 0;
2214 }
2215
2216 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2217 {
2218         int i, r;
2219
2220         for (i = 0; i < adev->num_ip_blocks; i++) {
2221                 if (!adev->ip_blocks[i].status.sw)
2222                         continue;
2223                 if (adev->ip_blocks[i].status.hw)
2224                         continue;
2225                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2226                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2227                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2228                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2229                         if (r) {
2230                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2231                                           adev->ip_blocks[i].version->funcs->name, r);
2232                                 return r;
2233                         }
2234                         adev->ip_blocks[i].status.hw = true;
2235                 }
2236         }
2237
2238         return 0;
2239 }
2240
2241 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2242 {
2243         int i, r;
2244
2245         for (i = 0; i < adev->num_ip_blocks; i++) {
2246                 if (!adev->ip_blocks[i].status.sw)
2247                         continue;
2248                 if (adev->ip_blocks[i].status.hw)
2249                         continue;
2250                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2251                 if (r) {
2252                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2253                                   adev->ip_blocks[i].version->funcs->name, r);
2254                         return r;
2255                 }
2256                 adev->ip_blocks[i].status.hw = true;
2257         }
2258
2259         return 0;
2260 }
2261
2262 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2263 {
2264         int r = 0;
2265         int i;
2266         uint32_t smu_version;
2267
2268         if (adev->asic_type >= CHIP_VEGA10) {
2269                 for (i = 0; i < adev->num_ip_blocks; i++) {
2270                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2271                                 continue;
2272
2273                         if (!adev->ip_blocks[i].status.sw)
2274                                 continue;
2275
2276                         /* no need to do the fw loading again if already done */
2277                         if (adev->ip_blocks[i].status.hw == true)
2278                                 break;
2279
2280                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2281                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2282                                 if (r) {
2283                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2284                                                           adev->ip_blocks[i].version->funcs->name, r);
2285                                         return r;
2286                                 }
2287                         } else {
2288                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2289                                 if (r) {
2290                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2291                                                           adev->ip_blocks[i].version->funcs->name, r);
2292                                         return r;
2293                                 }
2294                         }
2295
2296                         adev->ip_blocks[i].status.hw = true;
2297                         break;
2298                 }
2299         }
2300
2301         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2302                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2303
2304         return r;
2305 }
2306
2307 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2308 {
2309         long timeout;
2310         int r, i;
2311
2312         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2313                 struct amdgpu_ring *ring = adev->rings[i];
2314
2315                 /* No need to set up the GPU scheduler for rings that don't need it */
2316                 if (!ring || ring->no_scheduler)
2317                         continue;
2318
2319                 switch (ring->funcs->type) {
2320                 case AMDGPU_RING_TYPE_GFX:
2321                         timeout = adev->gfx_timeout;
2322                         break;
2323                 case AMDGPU_RING_TYPE_COMPUTE:
2324                         timeout = adev->compute_timeout;
2325                         break;
2326                 case AMDGPU_RING_TYPE_SDMA:
2327                         timeout = adev->sdma_timeout;
2328                         break;
2329                 default:
2330                         timeout = adev->video_timeout;
2331                         break;
2332                 }
2333
2334                 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2335                                    ring->num_hw_submission, amdgpu_job_hang_limit,
2336                                    timeout, adev->reset_domain->wq,
2337                                    ring->sched_score, ring->name,
2338                                    adev->dev);
2339                 if (r) {
2340                         DRM_ERROR("Failed to create scheduler on ring %s.\n",
2341                                   ring->name);
2342                         return r;
2343                 }
2344         }
2345
2346         return 0;
2347 }
2348
2349
2350 /**
2351  * amdgpu_device_ip_init - run init for hardware IPs
2352  *
2353  * @adev: amdgpu_device pointer
2354  *
2355  * Main initialization pass for hardware IPs.  The list of all the hardware
2356  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2357  * are run.  sw_init initializes the software state associated with each IP
2358  * and hw_init initializes the hardware associated with each IP.
2359  * Returns 0 on success, negative error code on failure.
2360  */
2361 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2362 {
2363         int i, r;
2364
2365         r = amdgpu_ras_init(adev);
2366         if (r)
2367                 return r;
2368
2369         for (i = 0; i < adev->num_ip_blocks; i++) {
2370                 if (!adev->ip_blocks[i].status.valid)
2371                         continue;
2372                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2373                 if (r) {
2374                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2375                                   adev->ip_blocks[i].version->funcs->name, r);
2376                         goto init_failed;
2377                 }
2378                 adev->ip_blocks[i].status.sw = true;
2379
2380                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2381                         /* need to do common hw init early so everything is set up for gmc */
2382                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2383                         if (r) {
2384                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2385                                 goto init_failed;
2386                         }
2387                         adev->ip_blocks[i].status.hw = true;
2388                 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2389                         /* need to do gmc hw init early so we can allocate gpu mem */
2390                         /* Try to reserve bad pages early */
2391                         if (amdgpu_sriov_vf(adev))
2392                                 amdgpu_virt_exchange_data(adev);
2393
2394                         r = amdgpu_device_mem_scratch_init(adev);
2395                         if (r) {
2396                                 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2397                                 goto init_failed;
2398                         }
2399                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2400                         if (r) {
2401                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2402                                 goto init_failed;
2403                         }
2404                         r = amdgpu_device_wb_init(adev);
2405                         if (r) {
2406                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2407                                 goto init_failed;
2408                         }
2409                         adev->ip_blocks[i].status.hw = true;
2410
2411                         /* right after GMC hw init, we create CSA */
2412                         if (amdgpu_mcbp) {
2413                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2414                                                                AMDGPU_GEM_DOMAIN_VRAM |
2415                                                                AMDGPU_GEM_DOMAIN_GTT,
2416                                                                AMDGPU_CSA_SIZE);
2417                                 if (r) {
2418                                         DRM_ERROR("allocate CSA failed %d\n", r);
2419                                         goto init_failed;
2420                                 }
2421                         }
2422                 }
2423         }
2424
2425         if (amdgpu_sriov_vf(adev))
2426                 amdgpu_virt_init_data_exchange(adev);
2427
2428         r = amdgpu_ib_pool_init(adev);
2429         if (r) {
2430                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2431                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2432                 goto init_failed;
2433         }
2434
2435         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2436         if (r)
2437                 goto init_failed;
2438
2439         r = amdgpu_device_ip_hw_init_phase1(adev);
2440         if (r)
2441                 goto init_failed;
2442
2443         r = amdgpu_device_fw_loading(adev);
2444         if (r)
2445                 goto init_failed;
2446
2447         r = amdgpu_device_ip_hw_init_phase2(adev);
2448         if (r)
2449                 goto init_failed;
2450
2451         /*
2452          * Retired pages will be loaded from eeprom and reserved here.
2453          * This must be called after amdgpu_device_ip_hw_init_phase2 since
2454          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2455          * functional for I2C communication, which is only true at this point.
2456          *
2457          * amdgpu_ras_recovery_init may fail, but the upper layers only care
2458          * about failures caused by a bad gpu situation and stop the amdgpu
2459          * init process accordingly. For other failures, it still releases all
2460          * the resources and prints an error message rather than returning a
2461          * negative value to the upper level.
2462          *
2463          * Note: theoretically, this should be called before all vram allocations
2464          * to protect retired pages from being abused.
2466         r = amdgpu_ras_recovery_init(adev);
2467         if (r)
2468                 goto init_failed;
2469
2470         /*
2471          * In case of XGMI, grab an extra reference on the reset domain for this device.
2472          */
2473         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2474                 if (amdgpu_xgmi_add_device(adev) == 0) {
2475                         if (!amdgpu_sriov_vf(adev)) {
2476                                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2477
2478                                 if (WARN_ON(!hive)) {
2479                                         r = -ENOENT;
2480                                         goto init_failed;
2481                                 }
2482
2483                                 if (!hive->reset_domain ||
2484                                     !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2485                                         r = -ENOENT;
2486                                         amdgpu_put_xgmi_hive(hive);
2487                                         goto init_failed;
2488                                 }
2489
2490                                 /* Drop the early temporary reset domain we created for device */
2491                                 amdgpu_reset_put_reset_domain(adev->reset_domain);
2492                                 adev->reset_domain = hive->reset_domain;
2493                                 amdgpu_put_xgmi_hive(hive);
2494                         }
2495                 }
2496         }
2497
2498         r = amdgpu_device_init_schedulers(adev);
2499         if (r)
2500                 goto init_failed;
2501
2502         /* Don't init kfd if the whole hive needs to be reset during init */
2503         if (!adev->gmc.xgmi.pending_reset)
2504                 amdgpu_amdkfd_device_init(adev);
2505
2506         amdgpu_fru_get_product_info(adev);
2507
2508 init_failed:
2509         if (amdgpu_sriov_vf(adev))
2510                 amdgpu_virt_release_full_gpu(adev, true);
2511
2512         return r;
2513 }
2514
2515 /**
2516  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2517  *
2518  * @adev: amdgpu_device pointer
2519  *
2520  * Saves a reset magic value from the gart pointer in VRAM.  The driver calls
2521  * this function before a GPU reset.  If the value is retained after a
2522  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2523  */
2524 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2525 {
2526         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2527 }
2528
2529 /**
2530  * amdgpu_device_check_vram_lost - check if vram is valid
2531  *
2532  * @adev: amdgpu_device pointer
2533  *
2534  * Checks the reset magic value written to the gart pointer in VRAM.
2535  * The driver calls this after a GPU reset to see if the contents of
2536  * VRAM is lost or now.
2537  * returns true if vram is lost, false if not.
2538  */
2539 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2540 {
2541         if (memcmp(adev->gart.ptr, adev->reset_magic,
2542                         AMDGPU_RESET_MAGIC_NUM))
2543                 return true;
2544
2545         if (!amdgpu_in_reset(adev))
2546                 return false;
2547
2548         /*
2549          * For all ASICs with baco/mode1 reset, the VRAM is
2550          * always assumed to be lost.
2551          */
2552         switch (amdgpu_asic_reset_method(adev)) {
2553         case AMD_RESET_METHOD_BACO:
2554         case AMD_RESET_METHOD_MODE1:
2555                 return true;
2556         default:
2557                 return false;
2558         }
2559 }
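
/*
 * Illustrative sketch (hedged, the actual callers live elsewhere in this
 * file): the two helpers above form a pair around a reset, roughly
 *
 *     amdgpu_device_fill_reset_magic(adev);            (before the reset)
 *     ...asic reset...
 *     vram_lost = amdgpu_device_check_vram_lost(adev); (after the reset)
 *
 * with VRAM contents reinitialized when vram_lost is true.
 */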
2560
2561 /**
2562  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2563  *
2564  * @adev: amdgpu_device pointer
2565  * @state: clockgating state (gate or ungate)
2566  *
2567  * The list of all the hardware IPs that make up the asic is walked and the
2568  * set_clockgating_state callbacks are run.
2569  * During the late initialization pass this enables clockgating for hardware IPs;
2570  * during the fini or suspend pass it disables clockgating for hardware IPs.
2571  * Returns 0 on success, negative error code on failure.
2572  */
2573
2574 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2575                                enum amd_clockgating_state state)
2576 {
2577         int i, j, r;
2578
2579         if (amdgpu_emu_mode == 1)
2580                 return 0;
2581
2582         for (j = 0; j < adev->num_ip_blocks; j++) {
2583                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2584                 if (!adev->ip_blocks[i].status.late_initialized)
2585                         continue;
2586                 /* skip CG for GFX, SDMA on S0ix */
2587                 if (adev->in_s0ix &&
2588                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2589                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2590                         continue;
2591                 /* skip CG for VCE/UVD, it's handled specially */
2592                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2593                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2594                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2595                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2596                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2597                         /* enable clockgating to save power */
2598                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2599                                                                                      state);
2600                         if (r) {
2601                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2602                                           adev->ip_blocks[i].version->funcs->name, r);
2603                                 return r;
2604                         }
2605                 }
2606         }
2607
2608         return 0;
2609 }
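
/*
 * Illustrative sketch (both calls appear later in this file): late init gates
 * clockgating in forward IP order while fini/suspend ungates it in reverse
 * order, so callers pair the two states, roughly
 *
 *     amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);     (late init)
 *     ...
 *     amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);   (fini/suspend)
 */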
2610
2611 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2612                                enum amd_powergating_state state)
2613 {
2614         int i, j, r;
2615
2616         if (amdgpu_emu_mode == 1)
2617                 return 0;
2618
2619         for (j = 0; j < adev->num_ip_blocks; j++) {
2620                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2621                 if (!adev->ip_blocks[i].status.late_initialized)
2622                         continue;
2623                 /* skip PG for GFX, SDMA on S0ix */
2624                 if (adev->in_s0ix &&
2625                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2626                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2627                         continue;
2628                 /* skip PG for VCE/UVD, it's handled specially */
2629                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2630                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2631                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2632                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2633                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2634                         /* enable powergating to save power */
2635                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2636                                                                                         state);
2637                         if (r) {
2638                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2639                                           adev->ip_blocks[i].version->funcs->name, r);
2640                                 return r;
2641                         }
2642                 }
2643         }
2644         return 0;
2645 }
2646
2647 static int amdgpu_device_enable_mgpu_fan_boost(void)
2648 {
2649         struct amdgpu_gpu_instance *gpu_ins;
2650         struct amdgpu_device *adev;
2651         int i, ret = 0;
2652
2653         mutex_lock(&mgpu_info.mutex);
2654
2655         /*
2656          * MGPU fan boost feature should be enabled
2657          * only when there are two or more dGPUs in
2658          * the system
2659          */
2660         if (mgpu_info.num_dgpu < 2)
2661                 goto out;
2662
2663         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2664                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2665                 adev = gpu_ins->adev;
2666                 if (!(adev->flags & AMD_IS_APU) &&
2667                     !gpu_ins->mgpu_fan_enabled) {
2668                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2669                         if (ret)
2670                                 break;
2671
2672                         gpu_ins->mgpu_fan_enabled = 1;
2673                 }
2674         }
2675
2676 out:
2677         mutex_unlock(&mgpu_info.mutex);
2678
2679         return ret;
2680 }
2681
2682 /**
2683  * amdgpu_device_ip_late_init - run late init for hardware IPs
2684  *
2685  * @adev: amdgpu_device pointer
2686  *
2687  * Late initialization pass for hardware IPs.  The list of all the hardware
2688  * IPs that make up the asic is walked and the late_init callbacks are run.
2689  * late_init covers any special initialization that an IP requires
2690  * after all of the IPs have been initialized or something that needs to happen
2691  * late in the init process.
2692  * Returns 0 on success, negative error code on failure.
2693  */
2694 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2695 {
2696         struct amdgpu_gpu_instance *gpu_instance;
2697         int i = 0, r;
2698
2699         for (i = 0; i < adev->num_ip_blocks; i++) {
2700                 if (!adev->ip_blocks[i].status.hw)
2701                         continue;
2702                 if (adev->ip_blocks[i].version->funcs->late_init) {
2703                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2704                         if (r) {
2705                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2706                                           adev->ip_blocks[i].version->funcs->name, r);
2707                                 return r;
2708                         }
2709                 }
2710                 adev->ip_blocks[i].status.late_initialized = true;
2711         }
2712
2713         r = amdgpu_ras_late_init(adev);
2714         if (r) {
2715                 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2716                 return r;
2717         }
2718
2719         amdgpu_ras_set_error_query_ready(adev, true);
2720
2721         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2722         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2723
2724         amdgpu_device_fill_reset_magic(adev);
2725
2726         r = amdgpu_device_enable_mgpu_fan_boost();
2727         if (r)
2728                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2729
2730         /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
2731         if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2732                                adev->asic_type == CHIP_ALDEBARAN))
2733                 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2734
2735         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2736                 mutex_lock(&mgpu_info.mutex);
2737
2738                 /*
2739                  * Reset device p-state to low as this was booted with high.
2740                  *
2741                  * This should be performed only after all devices from the same
2742                  * hive have been initialized.
2743                  *
2744                  * However, the number of devices in the hive is not known in
2745                  * advance; it is counted one by one as devices are initialized.
2746                  *
2747                  * So we wait until all XGMI interlinked devices have been
2748                  * initialized. This may add some delay as those devices may
2749                  * come from different hives, but that should be OK.
2750                  */
2751                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2752                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2753                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2754                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2755                                         continue;
2756
2757                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2758                                                 AMDGPU_XGMI_PSTATE_MIN);
2759                                 if (r) {
2760                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2761                                         break;
2762                                 }
2763                         }
2764                 }
2765
2766                 mutex_unlock(&mgpu_info.mutex);
2767         }
2768
2769         return 0;
2770 }
2771
2772 /**
2773  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2774  *
2775  * @adev: amdgpu_device pointer
2776  *
2777  * For ASICs that need to disable the SMC first
2778  */
2779 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2780 {
2781         int i, r;
2782
2783         if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2784                 return;
2785
2786         for (i = 0; i < adev->num_ip_blocks; i++) {
2787                 if (!adev->ip_blocks[i].status.hw)
2788                         continue;
2789                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2790                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2791                         /* XXX handle errors */
2792                         if (r) {
2793                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2794                                           adev->ip_blocks[i].version->funcs->name, r);
2795                         }
2796                         adev->ip_blocks[i].status.hw = false;
2797                         break;
2798                 }
2799         }
2800 }
2801
2802 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2803 {
2804         int i, r;
2805
2806         for (i = 0; i < adev->num_ip_blocks; i++) {
2807                 if (!adev->ip_blocks[i].version->funcs->early_fini)
2808                         continue;
2809
2810                 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2811                 if (r) {
2812                         DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2813                                   adev->ip_blocks[i].version->funcs->name, r);
2814                 }
2815         }
2816
2817         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2818         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2819
2820         amdgpu_amdkfd_suspend(adev, false);
2821
2822         /* Workaround for ASICs that need to disable SMC first */
2823         amdgpu_device_smu_fini_early(adev);
2824
2825         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2826                 if (!adev->ip_blocks[i].status.hw)
2827                         continue;
2828
2829                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2830                 /* XXX handle errors */
2831                 if (r) {
2832                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2833                                   adev->ip_blocks[i].version->funcs->name, r);
2834                 }
2835
2836                 adev->ip_blocks[i].status.hw = false;
2837         }
2838
2839         if (amdgpu_sriov_vf(adev)) {
2840                 if (amdgpu_virt_release_full_gpu(adev, false))
2841                         DRM_ERROR("failed to release exclusive mode on fini\n");
2842         }
2843
2844         return 0;
2845 }
2846
2847 /**
2848  * amdgpu_device_ip_fini - run fini for hardware IPs
2849  *
2850  * @adev: amdgpu_device pointer
2851  *
2852  * Main teardown pass for hardware IPs.  The list of all the hardware
2853  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2854  * are run.  hw_fini tears down the hardware associated with each IP
2855  * and sw_fini tears down any software state associated with each IP.
2856  * Returns 0 on success, negative error code on failure.
2857  */
2858 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2859 {
2860         int i, r;
2861
2862         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2863                 amdgpu_virt_release_ras_err_handler_data(adev);
2864
2865         if (adev->gmc.xgmi.num_physical_nodes > 1)
2866                 amdgpu_xgmi_remove_device(adev);
2867
2868         amdgpu_amdkfd_device_fini_sw(adev);
2869
2870         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2871                 if (!adev->ip_blocks[i].status.sw)
2872                         continue;
2873
2874                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2875                         amdgpu_ucode_free_bo(adev);
2876                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2877                         amdgpu_device_wb_fini(adev);
2878                         amdgpu_device_mem_scratch_fini(adev);
2879                         amdgpu_ib_pool_fini(adev);
2880                 }
2881
2882                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2883                 /* XXX handle errors */
2884                 if (r) {
2885                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2886                                   adev->ip_blocks[i].version->funcs->name, r);
2887                 }
2888                 adev->ip_blocks[i].status.sw = false;
2889                 adev->ip_blocks[i].status.valid = false;
2890         }
2891
2892         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2893                 if (!adev->ip_blocks[i].status.late_initialized)
2894                         continue;
2895                 if (adev->ip_blocks[i].version->funcs->late_fini)
2896                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2897                 adev->ip_blocks[i].status.late_initialized = false;
2898         }
2899
2900         amdgpu_ras_fini(adev);
2901
2902         return 0;
2903 }
2904
2905 /**
2906  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2907  *
2908  * @work: work_struct.
2909  */
2910 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2911 {
2912         struct amdgpu_device *adev =
2913                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2914         int r;
2915
2916         r = amdgpu_ib_ring_tests(adev);
2917         if (r)
2918                 DRM_ERROR("ib ring test failed (%d).\n", r);
2919 }
2920
2921 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2922 {
2923         struct amdgpu_device *adev =
2924                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2925
2926         WARN_ON_ONCE(adev->gfx.gfx_off_state);
2927         WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2928
2929         if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2930                 adev->gfx.gfx_off_state = true;
2931 }
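
/*
 * Illustrative sketch (not compiled): roughly how the counterpart request
 * path interacts with the delayed work above.  The real logic lives in
 * amdgpu_gfx_off_ctrl() in amdgpu_gfx.c; the exact delay value and extra
 * bookkeeping there may differ from this simplified version.
 */
#if 0
static void example_gfx_off_ctrl(struct amdgpu_device *adev, bool allow)
{
	mutex_lock(&adev->gfx.gfx_off_mutex);
	if (allow) {
		/* last user dropped its reference: arm the delayed enable */
		if (adev->gfx.gfx_off_req_count && !--adev->gfx.gfx_off_req_count)
			schedule_delayed_work(&adev->gfx.gfx_off_delay_work,
					      msecs_to_jiffies(100) /* assumed delay */);
	} else {
		/* new user: make sure GFXOFF is exited before GFX is touched */
		if (!adev->gfx.gfx_off_req_count++ && adev->gfx.gfx_off_state) {
			if (!amdgpu_dpm_set_powergating_by_smu(adev,
					AMD_IP_BLOCK_TYPE_GFX, false))
				adev->gfx.gfx_off_state = false;
		}
	}
	mutex_unlock(&adev->gfx.gfx_off_mutex);
}
#endif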
2932
2933 /**
2934  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2935  *
2936  * @adev: amdgpu_device pointer
2937  *
2938  * First suspend pass for hardware IPs.  Clock and power gating are
2939  * ungated, then the list of all the hardware IPs that make up the asic
2940  * is walked and the suspend callbacks are run for the display (DCE)
2941  * blocks only.  The remaining blocks are handled in phase 2.
2942  * Returns 0 on success, negative error code on failure.
2943  */
2944 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2945 {
2946         int i, r;
2947
2948         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2949         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2950
2951         /*
2952          * Per the PMFW team's suggestion, the driver needs to disable the
2953          * gfxoff and df cstate features for gpu reset (e.g. Mode1Reset)
2954          * scenarios. Add the missing df cstate disablement here.
2955          */
2956         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2957                 dev_warn(adev->dev, "Failed to disallow df cstate");
2958
2959         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2960                 if (!adev->ip_blocks[i].status.valid)
2961                         continue;
2962
2963                 /* displays are handled separately */
2964                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2965                         continue;
2966
2968                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2969                 /* XXX handle errors */
2970                 if (r) {
2971                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2972                                   adev->ip_blocks[i].version->funcs->name, r);
2973                         return r;
2974                 }
2975
2976                 adev->ip_blocks[i].status.hw = false;
2977         }
2978
2979         return 0;
2980 }
2981
2982 /**
2983  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2984  *
2985  * @adev: amdgpu_device pointer
2986  *
2987  * Second suspend pass for hardware IPs.  The list of all the hardware
2988  * IPs that make up the asic is walked and the suspend callbacks are run
2989  * for every block except the displays, which were handled in phase 1.
2990  * suspend puts each IP into a state suitable for suspend.
2991  * Returns 0 on success, negative error code on failure.
2992  */
2993 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2994 {
2995         int i, r;
2996
2997         if (adev->in_s0ix)
2998                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2999
3000         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3001                 if (!adev->ip_blocks[i].status.valid)
3002                         continue;
3003                 /* displays are handled in phase1 */
3004                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3005                         continue;
3006                 /* PSP lost connection when err_event_athub occurs */
3007                 if (amdgpu_ras_intr_triggered() &&
3008                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3009                         adev->ip_blocks[i].status.hw = false;
3010                         continue;
3011                 }
3012
3013                 /* skip unnecessary suspend if the IP blocks have not been initialized yet */
3014                 if (adev->gmc.xgmi.pending_reset &&
3015                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3016                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3017                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3018                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3019                         adev->ip_blocks[i].status.hw = false;
3020                         continue;
3021                 }
3022
3023                 /* skip suspend of gfx/mes and psp for S0ix
3024                  * gfx is in gfxoff state, so on resume it will exit gfxoff just
3025                  * like at runtime. PSP is also part of the always on hardware
3026                  * so no need to suspend it.
3027                  */
3028                 if (adev->in_s0ix &&
3029                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3030                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3031                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3032                         continue;
3033
3034                 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3035                 if (adev->in_s0ix &&
3036                     (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3037                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3038                         continue;
3039
3040                 /* During cold boot, swPSP provides the IMU and RLC FW binaries to TOS.
3041                  * These live in the TMR, so PSP-TOS is expected to reuse them from that
3042                  * location, and RLC autoload is also triggered from there based on the
3043                  * PMFW -> PSP message during the re-init sequence.
3044                  * Therefore, PSP suspend & resume should be skipped to avoid destroying
3045                  * the TMR and reloading the FWs again on IMU-enabled APU ASICs.
3046                  */
3047                 if (amdgpu_in_reset(adev) &&
3048                     (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3049                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3050                         continue;
3051
3053                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3054                 /* XXX handle errors */
3055                 if (r) {
3056                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
3057                                   adev->ip_blocks[i].version->funcs->name, r);
3058                 }
3059                 adev->ip_blocks[i].status.hw = false;
3060                 /* handle putting the SMC in the appropriate state */
3061                 if (!amdgpu_sriov_vf(adev)) {
3062                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3063                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3064                                 if (r) {
3065                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3066                                                         adev->mp1_state, r);
3067                                         return r;
3068                                 }
3069                         }
3070                 }
3071         }
3072
3073         return 0;
3074 }
3075
3076 /**
3077  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3078  *
3079  * @adev: amdgpu_device pointer
3080  *
3081  * Main suspend function for hardware IPs.  The list of all the hardware
3082  * IPs that make up the asic is walked, clockgating is disabled and the
3083  * suspend callbacks are run.  suspend puts the hardware and software state
3084  * in each IP into a state suitable for suspend.
3085  * Returns 0 on success, negative error code on failure.
3086  */
3087 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3088 {
3089         int r;
3090
3091         if (amdgpu_sriov_vf(adev)) {
3092                 amdgpu_virt_fini_data_exchange(adev);
3093                 amdgpu_virt_request_full_gpu(adev, false);
3094         }
3095
3096         r = amdgpu_device_ip_suspend_phase1(adev);
3097         if (r)
3098                 return r;
3099         r = amdgpu_device_ip_suspend_phase2(adev);
3100
3101         if (amdgpu_sriov_vf(adev))
3102                 amdgpu_virt_release_full_gpu(adev, false);
3103
3104         return r;
3105 }
3106
3107 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3108 {
3109         int i, r;
3110
3111         static enum amd_ip_block_type ip_order[] = {
3112                 AMD_IP_BLOCK_TYPE_COMMON,
3113                 AMD_IP_BLOCK_TYPE_GMC,
3114                 AMD_IP_BLOCK_TYPE_PSP,
3115                 AMD_IP_BLOCK_TYPE_IH,
3116         };
3117
3118         for (i = 0; i < adev->num_ip_blocks; i++) {
3119                 int j;
3120                 struct amdgpu_ip_block *block;
3121
3122                 block = &adev->ip_blocks[i];
3123                 block->status.hw = false;
3124
3125                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3126
3127                         if (block->version->type != ip_order[j] ||
3128                                 !block->status.valid)
3129                                 continue;
3130
3131                         r = block->version->funcs->hw_init(adev);
3132                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
3133                         if (r)
3134                                 return r;
3135                         block->status.hw = true;
3136                 }
3137         }
3138
3139         return 0;
3140 }
3141
3142 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3143 {
3144         int i, r;
3145
3146         static enum amd_ip_block_type ip_order[] = {
3147                 AMD_IP_BLOCK_TYPE_SMC,
3148                 AMD_IP_BLOCK_TYPE_DCE,
3149                 AMD_IP_BLOCK_TYPE_GFX,
3150                 AMD_IP_BLOCK_TYPE_SDMA,
3151                 AMD_IP_BLOCK_TYPE_UVD,
3152                 AMD_IP_BLOCK_TYPE_VCE,
3153                 AMD_IP_BLOCK_TYPE_VCN
3154         };
3155
3156         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3157                 int j;
3158                 struct amdgpu_ip_block *block;
3159
3160                 for (j = 0; j < adev->num_ip_blocks; j++) {
3161                         block = &adev->ip_blocks[j];
3162
3163                         if (block->version->type != ip_order[i] ||
3164                                 !block->status.valid ||
3165                                 block->status.hw)
3166                                 continue;
3167
3168                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3169                                 r = block->version->funcs->resume(adev);
3170                         else
3171                                 r = block->version->funcs->hw_init(adev);
3172
3173                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
3174                         if (r)
3175                                 return r;
3176                         block->status.hw = true;
3177                 }
3178         }
3179
3180         return 0;
3181 }
3182
3183 /**
3184  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3185  *
3186  * @adev: amdgpu_device pointer
3187  *
3188  * First resume function for hardware IPs.  The list of all the hardware
3189  * IPs that make up the asic is walked and the resume callbacks are run for
3190  * COMMON, GMC, IH and (for SR-IOV) PSP.  resume puts the hardware into a functional state
3191  * after a suspend and updates the software state as necessary.  This
3192  * function is also used for restoring the GPU after a GPU reset.
3193  * Returns 0 on success, negative error code on failure.
3194  */
3195 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3196 {
3197         int i, r;
3198
3199         for (i = 0; i < adev->num_ip_blocks; i++) {
3200                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3201                         continue;
3202                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3203                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3204                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3205                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3206
3207                         r = adev->ip_blocks[i].version->funcs->resume(adev);
3208                         if (r) {
3209                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
3210                                           adev->ip_blocks[i].version->funcs->name, r);
3211                                 return r;
3212                         }
3213                         adev->ip_blocks[i].status.hw = true;
3214                 }
3215         }
3216
3217         return 0;
3218 }
3219
3220 /**
3221  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3222  *
3223  * @adev: amdgpu_device pointer
3224  *
3225  * Second resume function for hardware IPs.  The list of all the hardware
3226  * IPs that make up the asic is walked and the resume callbacks are run for
3227  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3228  * functional state after a suspend and updates the software state as
3229  * necessary.  This function is also used for restoring the GPU after a GPU
3230  * reset.
3231  * Returns 0 on success, negative error code on failure.
3232  */
3233 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3234 {
3235         int i, r;
3236
3237         for (i = 0; i < adev->num_ip_blocks; i++) {
3238                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3239                         continue;
3240                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3241                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3242                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3243                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3244                         continue;
3245                 r = adev->ip_blocks[i].version->funcs->resume(adev);
3246                 if (r) {
3247                         DRM_ERROR("resume of IP block <%s> failed %d\n",
3248                                   adev->ip_blocks[i].version->funcs->name, r);
3249                         return r;
3250                 }
3251                 adev->ip_blocks[i].status.hw = true;
3252         }
3253
3254         return 0;
3255 }
3256
3257 /**
3258  * amdgpu_device_ip_resume - run resume for hardware IPs
3259  *
3260  * @adev: amdgpu_device pointer
3261  *
3262  * Main resume function for hardware IPs.  The hardware IPs
3263  * are split into two resume functions because they are
3264  * also used in recovering from a GPU reset and some additional
3265  * steps need to be taken between them.  In this case (S3/S4) they are
3266  * run sequentially.
3267  * Returns 0 on success, negative error code on failure.
3268  */
3269 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3270 {
3271         int r;
3272
3273         r = amdgpu_amdkfd_resume_iommu(adev);
3274         if (r)
3275                 return r;
3276
3277         r = amdgpu_device_ip_resume_phase1(adev);
3278         if (r)
3279                 return r;
3280
3281         r = amdgpu_device_fw_loading(adev);
3282         if (r)
3283                 return r;
3284
3285         r = amdgpu_device_ip_resume_phase2(adev);
3286
3287         return r;
3288 }
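
/*
 * Resume ordering at a glance (simplified summary of the helpers above):
 * phase 1 brings back the blocks everything else depends on (COMMON, GMC,
 * IH and, for SR-IOV, PSP), firmware is then (re)loaded via
 * amdgpu_device_fw_loading(), and phase 2 resumes the remaining blocks
 * (GFX, SDMA, VCN, display, ...).  GPU reset recovery reuses the same two
 * phases but inserts additional steps between them.
 */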
3289
3290 /**
3291  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3292  *
3293  * @adev: amdgpu_device pointer
3294  *
3295  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3296  */
3297 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3298 {
3299         if (amdgpu_sriov_vf(adev)) {
3300                 if (adev->is_atom_fw) {
3301                         if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3302                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3303                 } else {
3304                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3305                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3306                 }
3307
3308                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3309                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3310         }
3311 }
3312
3313 /**
3314  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3315  *
3316  * @asic_type: AMD asic type
3317  *
3318  * Check if there is DC (new modesetting infrastructure) support for an asic.
3319  * Returns true if DC has support, false if not.
3320  */
3321 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3322 {
3323         switch (asic_type) {
3324 #ifdef CONFIG_DRM_AMDGPU_SI
3325         case CHIP_HAINAN:
3326 #endif
3327         case CHIP_TOPAZ:
3328                 /* chips with no display hardware */
3329                 return false;
3330 #if defined(CONFIG_DRM_AMD_DC)
3331         case CHIP_TAHITI:
3332         case CHIP_PITCAIRN:
3333         case CHIP_VERDE:
3334         case CHIP_OLAND:
3335                 /*
3336                  * We have systems in the wild with these ASICs that require
3337                  * LVDS and VGA support which is not supported with DC.
3338                  *
3339                  * Fall back to the non-DC driver here by default so as not to
3340                  * cause regressions.
3341                  */
3342 #if defined(CONFIG_DRM_AMD_DC_SI)
3343                 return amdgpu_dc > 0;
3344 #else
3345                 return false;
3346 #endif
3347         case CHIP_BONAIRE:
3348         case CHIP_KAVERI:
3349         case CHIP_KABINI:
3350         case CHIP_MULLINS:
3351                 /*
3352                  * We have systems in the wild with these ASICs that require
3353                  * VGA support which is not supported with DC.
3354                  *
3355                  * Fall back to the non-DC driver here by default so as not to
3356                  * cause regressions.
3357                  */
3358                 return amdgpu_dc > 0;
3359         default:
3360                 return amdgpu_dc != 0;
3361 #else
3362         default:
3363                 if (amdgpu_dc > 0)
3364                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3365                                          "but isn't supported by ASIC, ignoring\n");
3366                 return false;
3367 #endif
3368         }
3369 }
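
/*
 * Usage note (assumed semantics of the amdgpu.dc module parameter, derived
 * from the checks above): with the auto default, DC is used on every ASIC
 * that falls through to the default case, while the legacy SI/CIK parts
 * listed above stay on the non-DC path unless DC is explicitly requested,
 * e.g. on the kernel command line:
 *
 *	amdgpu.dc=1	force Display Core on (where the kernel supports it)
 *	amdgpu.dc=0	force the legacy (non-DC) display path
 */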
3370
3371 /**
3372  * amdgpu_device_has_dc_support - check if dc is supported
3373  *
3374  * @adev: amdgpu_device pointer
3375  *
3376  * Returns true for supported, false for not supported
3377  */
3378 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3379 {
3380         if (adev->enable_virtual_display ||
3381             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3382                 return false;
3383
3384         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3385 }
3386
3387 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3388 {
3389         struct amdgpu_device *adev =
3390                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3391         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3392
3393         /* It's a bug to not have a hive within this function */
3394         if (WARN_ON(!hive))
3395                 return;
3396
3397         /*
3398          * Use task barrier to synchronize all xgmi reset works across the
3399          * hive. task_barrier_enter and task_barrier_exit will block
3400          * until all the threads running the xgmi reset works reach
3401          * those points. task_barrier_full will do both blocks.
3402          */
3403         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3404
3405                 task_barrier_enter(&hive->tb);
3406                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3407
3408                 if (adev->asic_reset_res)
3409                         goto fail;
3410
3411                 task_barrier_exit(&hive->tb);
3412                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3413
3414                 if (adev->asic_reset_res)
3415                         goto fail;
3416
3417                 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3418                     adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3419                         adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3420         } else {
3421
3422                 task_barrier_full(&hive->tb);
3423                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3424         }
3425
3426 fail:
3427         if (adev->asic_reset_res)
3428                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3429                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3430         amdgpu_put_xgmi_hive(hive);
3431 }
3432
3433 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3434 {
3435         char *input = amdgpu_lockup_timeout;
3436         char *timeout_setting = NULL;
3437         int index = 0;
3438         long timeout;
3439         int ret = 0;
3440
3441         /*
3442          * By default the timeout for non-compute jobs is 10000 ms
3443          * and 60000 ms for compute jobs.
3444          * Under SR-IOV, compute jobs only get 60000 ms when one VF
3445          * owns the PP (pp_one_vf mode), otherwise 10000 ms.
3446          */
3447         adev->gfx_timeout = msecs_to_jiffies(10000);
3448         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3449         if (amdgpu_sriov_vf(adev))
3450                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3451                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3452         else
3453                 adev->compute_timeout =  msecs_to_jiffies(60000);
3454
3455         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3456                 while ((timeout_setting = strsep(&input, ",")) &&
3457                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3458                         ret = kstrtol(timeout_setting, 0, &timeout);
3459                         if (ret)
3460                                 return ret;
3461
3462                         if (timeout == 0) {
3463                                 index++;
3464                                 continue;
3465                         } else if (timeout < 0) {
3466                                 timeout = MAX_SCHEDULE_TIMEOUT;
3467                                 dev_warn(adev->dev, "lockup timeout disabled");
3468                                 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3469                         } else {
3470                                 timeout = msecs_to_jiffies(timeout);
3471                         }
3472
3473                         switch (index++) {
3474                         case 0:
3475                                 adev->gfx_timeout = timeout;
3476                                 break;
3477                         case 1:
3478                                 adev->compute_timeout = timeout;
3479                                 break;
3480                         case 2:
3481                                 adev->sdma_timeout = timeout;
3482                                 break;
3483                         case 3:
3484                                 adev->video_timeout = timeout;
3485                                 break;
3486                         default:
3487                                 break;
3488                         }
3489                 }
3490                 /*
3491                  * There is only one value specified and
3492                  * it should apply to all non-compute jobs.
3493                  */
3494                 if (index == 1) {
3495                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3496                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3497                                 adev->compute_timeout = adev->gfx_timeout;
3498                 }
3499         }
3500
3501         return ret;
3502 }
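
/*
 * Usage note (derived from the parser above): amdgpu.lockup_timeout takes up
 * to four comma separated values in milliseconds, applied in the order
 * GFX,Compute,SDMA,Video.  A value of 0 keeps the default for that queue and
 * a negative value disables the timeout (and taints the kernel).  Examples:
 *
 *	amdgpu.lockup_timeout=10000		10s for all non-compute jobs
 *	amdgpu.lockup_timeout=10000,20000	10s gfx, 20s compute
 *	amdgpu.lockup_timeout=0,0,0,-1		only disable the video timeout
 */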
3503
3504 /**
3505  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3506  *
3507  * @adev: amdgpu_device pointer
3508  *
3509  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3510  */
3511 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3512 {
3513         struct iommu_domain *domain;
3514
3515         domain = iommu_get_domain_for_dev(adev->dev);
3516         if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3517                 adev->ram_is_direct_mapped = true;
3518 }
3519
3520 static const struct attribute *amdgpu_dev_attributes[] = {
3521         &dev_attr_product_name.attr,
3522         &dev_attr_product_number.attr,
3523         &dev_attr_serial_number.attr,
3524         &dev_attr_pcie_replay_count.attr,
3525         NULL
3526 };
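
/*
 * These attributes are registered on the PCI device via sysfs_create_files()
 * in amdgpu_device_init() below, so they typically show up under the DRM
 * device's sysfs node, e.g. (paths assumed, card index varies):
 *
 *	/sys/class/drm/card0/device/product_name
 *	/sys/class/drm/card0/device/serial_number
 *	/sys/class/drm/card0/device/pcie_replay_count
 */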
3527
3528 /**
3529  * amdgpu_device_init - initialize the driver
3530  *
3531  * @adev: amdgpu_device pointer
3532  * @flags: driver flags
3533  *
3534  * Initializes the driver info and hw (all asics).
3535  * Returns 0 for success or an error on failure.
3536  * Called at driver startup.
3537  */
3538 int amdgpu_device_init(struct amdgpu_device *adev,
3539                        uint32_t flags)
3540 {
3541         struct drm_device *ddev = adev_to_drm(adev);
3542         struct pci_dev *pdev = adev->pdev;
3543         int r, i;
3544         bool px = false;
3545         u32 max_MBps;
3546
3547         adev->shutdown = false;
3548         adev->flags = flags;
3549
3550         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3551                 adev->asic_type = amdgpu_force_asic_type;
3552         else
3553                 adev->asic_type = flags & AMD_ASIC_MASK;
3554
3555         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3556         if (amdgpu_emu_mode == 1)
3557                 adev->usec_timeout *= 10;
3558         adev->gmc.gart_size = 512 * 1024 * 1024;
3559         adev->accel_working = false;
3560         adev->num_rings = 0;
3561         RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3562         adev->mman.buffer_funcs = NULL;
3563         adev->mman.buffer_funcs_ring = NULL;
3564         adev->vm_manager.vm_pte_funcs = NULL;
3565         adev->vm_manager.vm_pte_num_scheds = 0;
3566         adev->gmc.gmc_funcs = NULL;
3567         adev->harvest_ip_mask = 0x0;
3568         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3569         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3570
3571         adev->smc_rreg = &amdgpu_invalid_rreg;
3572         adev->smc_wreg = &amdgpu_invalid_wreg;
3573         adev->pcie_rreg = &amdgpu_invalid_rreg;
3574         adev->pcie_wreg = &amdgpu_invalid_wreg;
3575         adev->pciep_rreg = &amdgpu_invalid_rreg;
3576         adev->pciep_wreg = &amdgpu_invalid_wreg;
3577         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3578         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3579         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3580         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3581         adev->didt_rreg = &amdgpu_invalid_rreg;
3582         adev->didt_wreg = &amdgpu_invalid_wreg;
3583         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3584         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3585         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3586         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3587
3588         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3589                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3590                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3591
3592         /* mutex initializations are all done here so we
3593          * can recall functions without locking issues */
3594         mutex_init(&adev->firmware.mutex);
3595         mutex_init(&adev->pm.mutex);
3596         mutex_init(&adev->gfx.gpu_clock_mutex);
3597         mutex_init(&adev->srbm_mutex);
3598         mutex_init(&adev->gfx.pipe_reserve_mutex);
3599         mutex_init(&adev->gfx.gfx_off_mutex);
3600         mutex_init(&adev->grbm_idx_mutex);
3601         mutex_init(&adev->mn_lock);
3602         mutex_init(&adev->virt.vf_errors.lock);
3603         hash_init(adev->mn_hash);
3604         mutex_init(&adev->psp.mutex);
3605         mutex_init(&adev->notifier_lock);
3606         mutex_init(&adev->pm.stable_pstate_ctx_lock);
3607         mutex_init(&adev->benchmark_mutex);
3608
3609         amdgpu_device_init_apu_flags(adev);
3610
3611         r = amdgpu_device_check_arguments(adev);
3612         if (r)
3613                 return r;
3614
3615         spin_lock_init(&adev->mmio_idx_lock);
3616         spin_lock_init(&adev->smc_idx_lock);
3617         spin_lock_init(&adev->pcie_idx_lock);
3618         spin_lock_init(&adev->uvd_ctx_idx_lock);
3619         spin_lock_init(&adev->didt_idx_lock);
3620         spin_lock_init(&adev->gc_cac_idx_lock);
3621         spin_lock_init(&adev->se_cac_idx_lock);
3622         spin_lock_init(&adev->audio_endpt_idx_lock);
3623         spin_lock_init(&adev->mm_stats.lock);
3624
3625         INIT_LIST_HEAD(&adev->shadow_list);
3626         mutex_init(&adev->shadow_list_lock);
3627
3628         INIT_LIST_HEAD(&adev->reset_list);
3629
3630         INIT_LIST_HEAD(&adev->ras_list);
3631
3632         INIT_DELAYED_WORK(&adev->delayed_init_work,
3633                           amdgpu_device_delayed_init_work_handler);
3634         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3635                           amdgpu_device_delay_enable_gfx_off);
3636
3637         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3638
3639         adev->gfx.gfx_off_req_count = 1;
3640         adev->gfx.gfx_off_residency = 0;
3641         adev->gfx.gfx_off_entrycount = 0;
3642         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3643
3644         atomic_set(&adev->throttling_logging_enabled, 1);
3645         /*
3646          * If throttling continues, logging will be performed every minute
3647          * to avoid log flooding. "-1" is subtracted since the thermal
3648          * throttling interrupt comes every second. Thus, the total logging
3649          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3650          * for the throttling interrupt) = 60 seconds.
3651          */
3652         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3653         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3654
3655         /* Registers mapping */
3656         /* TODO: block userspace mapping of io register */
3657         if (adev->asic_type >= CHIP_BONAIRE) {
3658                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3659                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3660         } else {
3661                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3662                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3663         }
3664
3665         for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3666                 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3667
3668         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3669         if (adev->rmmio == NULL) {
3670                 return -ENOMEM;
3671         }
3672         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3673         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3674
3675         amdgpu_device_get_pcie_info(adev);
3676
3677         if (amdgpu_mcbp)
3678                 DRM_INFO("MCBP is enabled\n");
3679
3680         /*
3681          * The reset domain needs to be present early, before any XGMI hive is
3682          * discovered and initialized, so that the reset sem and in_gpu_reset
3683          * flag can be used early on during init and before any call to RREG32.
3684          */
3685         adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3686         if (!adev->reset_domain)
3687                 return -ENOMEM;
3688
3689         /* detect hw virtualization here */
3690         amdgpu_detect_virtualization(adev);
3691
3692         r = amdgpu_device_get_job_timeout_settings(adev);
3693         if (r) {
3694                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3695                 return r;
3696         }
3697
3698         /* early init functions */
3699         r = amdgpu_device_ip_early_init(adev);
3700         if (r)
3701                 return r;
3702
3703         /* Get rid of things like offb */
3704         r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3705         if (r)
3706                 return r;
3707
3708         /* Enable TMZ based on IP_VERSION */
3709         amdgpu_gmc_tmz_set(adev);
3710
3711         amdgpu_gmc_noretry_set(adev);
3712         /* Need to get xgmi info early to decide the reset behavior */
3713         if (adev->gmc.xgmi.supported) {
3714                 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3715                 if (r)
3716                         return r;
3717         }
3718
3719         /* enable PCIE atomic ops */
3720         if (amdgpu_sriov_vf(adev))
3721                 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3722                         adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3723                         (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3724         else
3725                 adev->have_atomics_support =
3726                         !pci_enable_atomic_ops_to_root(adev->pdev,
3727                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3728                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3729         if (!adev->have_atomics_support)
3730                 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3731
3732         /* doorbell bar mapping and doorbell index init */
3733         amdgpu_device_doorbell_init(adev);
3734
3735         if (amdgpu_emu_mode == 1) {
3736                 /* post the asic on emulation mode */
3737                 emu_soc_asic_init(adev);
3738                 goto fence_driver_init;
3739         }
3740
3741         amdgpu_reset_init(adev);
3742
3743         /* detect if we have an SR-IOV vBIOS */
3744         amdgpu_device_detect_sriov_bios(adev);
3745
3746         /* check if we need to reset the asic
3747          *  E.g., driver was not cleanly unloaded previously, etc.
3748          */
3749         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3750                 if (adev->gmc.xgmi.num_physical_nodes) {
3751                         dev_info(adev->dev, "Pending hive reset.\n");
3752                         adev->gmc.xgmi.pending_reset = true;
3753                         /* Only need to init necessary block for SMU to handle the reset */
3754                         for (i = 0; i < adev->num_ip_blocks; i++) {
3755                                 if (!adev->ip_blocks[i].status.valid)
3756                                         continue;
3757                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3758                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3759                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3760                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3761                                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3762                                                 adev->ip_blocks[i].version->funcs->name);
3763                                         adev->ip_blocks[i].status.hw = true;
3764                                 }
3765                         }
3766                 } else {
3767                         r = amdgpu_asic_reset(adev);
3768                         if (r) {
3769                                 dev_err(adev->dev, "asic reset on init failed\n");
3770                                 goto failed;
3771                         }
3772                 }
3773         }
3774
3775         pci_enable_pcie_error_reporting(adev->pdev);
3776
3777         /* Post card if necessary */
3778         if (amdgpu_device_need_post(adev)) {
3779                 if (!adev->bios) {
3780                         dev_err(adev->dev, "no vBIOS found\n");
3781                         r = -EINVAL;
3782                         goto failed;
3783                 }
3784                 DRM_INFO("GPU posting now...\n");
3785                 r = amdgpu_device_asic_init(adev);
3786                 if (r) {
3787                         dev_err(adev->dev, "gpu post error!\n");
3788                         goto failed;
3789                 }
3790         }
3791
3792         if (adev->is_atom_fw) {
3793                 /* Initialize clocks */
3794                 r = amdgpu_atomfirmware_get_clock_info(adev);
3795                 if (r) {
3796                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3797                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3798                         goto failed;
3799                 }
3800         } else {
3801                 /* Initialize clocks */
3802                 r = amdgpu_atombios_get_clock_info(adev);
3803                 if (r) {
3804                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3805                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3806                         goto failed;
3807                 }
3808                 /* init i2c buses */
3809                 if (!amdgpu_device_has_dc_support(adev))
3810                         amdgpu_atombios_i2c_init(adev);
3811         }
3812
3813 fence_driver_init:
3814         /* Fence driver */
3815         r = amdgpu_fence_driver_sw_init(adev);
3816         if (r) {
3817                 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3818                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3819                 goto failed;
3820         }
3821
3822         /* init the mode config */
3823         drm_mode_config_init(adev_to_drm(adev));
3824
3825         r = amdgpu_device_ip_init(adev);
3826         if (r) {
3827                 /* failed in exclusive mode due to timeout */
3828                 if (amdgpu_sriov_vf(adev) &&
3829                     !amdgpu_sriov_runtime(adev) &&
3830                     amdgpu_virt_mmio_blocked(adev) &&
3831                     !amdgpu_virt_wait_reset(adev)) {
3832                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3833                         /* Don't send request since VF is inactive. */
3834                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3835                         adev->virt.ops = NULL;
3836                         r = -EAGAIN;
3837                         goto release_ras_con;
3838                 }
3839                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3840                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3841                 goto release_ras_con;
3842         }
3843
3844         amdgpu_fence_driver_hw_init(adev);
3845
3846         dev_info(adev->dev,
3847                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3848                         adev->gfx.config.max_shader_engines,
3849                         adev->gfx.config.max_sh_per_se,
3850                         adev->gfx.config.max_cu_per_sh,
3851                         adev->gfx.cu_info.number);
3852
3853         adev->accel_working = true;
3854
3855         amdgpu_vm_check_compute_bug(adev);
3856
3857         /* Initialize the buffer migration limit. */
3858         if (amdgpu_moverate >= 0)
3859                 max_MBps = amdgpu_moverate;
3860         else
3861                 max_MBps = 8; /* Allow 8 MB/s. */
3862         /* Get a log2 for easy divisions. */
3863         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3864
3865         r = amdgpu_pm_sysfs_init(adev);
3866         if (r) {
3867                 adev->pm_sysfs_en = false;
3868                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3869         } else
3870                 adev->pm_sysfs_en = true;
3871
3872         r = amdgpu_ucode_sysfs_init(adev);
3873         if (r) {
3874                 adev->ucode_sysfs_en = false;
3875                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3876         } else
3877                 adev->ucode_sysfs_en = true;
3878
3879         r = amdgpu_psp_sysfs_init(adev);
3880         if (r) {
3881                 adev->psp_sysfs_en = false;
3882                 if (!amdgpu_sriov_vf(adev))
3883                         DRM_ERROR("Creating psp sysfs failed\n");
3884         } else
3885                 adev->psp_sysfs_en = true;
3886
3887         /*
3888          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3889          * Otherwise the mgpu fan boost feature will be skipped because the
3890          * gpu instance count would be too low.
3891          */
3892         amdgpu_register_gpu_instance(adev);
3893
3894         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3895          * explicit gating rather than handling it automatically.
3896          */
3897         if (!adev->gmc.xgmi.pending_reset) {
3898                 r = amdgpu_device_ip_late_init(adev);
3899                 if (r) {
3900                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3901                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3902                         goto release_ras_con;
3903                 }
3904                 /* must succeed. */
3905                 amdgpu_ras_resume(adev);
3906                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3907                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3908         }
3909
3910         if (amdgpu_sriov_vf(adev))
3911                 flush_delayed_work(&adev->delayed_init_work);
3912
3913         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3914         if (r)
3915                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3916
3917         if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3918                 r = amdgpu_pmu_init(adev);
3919                 if (r)
3920                         dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3921         }
3921
3922         /* Keep stored PCI config space at hand for restore after a sudden PCI error */
3923         if (amdgpu_device_cache_pci_state(adev->pdev))
3924                 pci_restore_state(pdev);
3925
3926         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3927         /* this will fail for cards that aren't VGA class devices, just
3928          * ignore it */
3929         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3930                 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3931
3932         if (amdgpu_device_supports_px(ddev)) {
3933                 px = true;
3934                 vga_switcheroo_register_client(adev->pdev,
3935                                                &amdgpu_switcheroo_ops, px);
3936                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3937         }
3938
3939         if (adev->gmc.xgmi.pending_reset)
3940                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3941                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3942
3943         amdgpu_device_check_iommu_direct_map(adev);
3944
3945         return 0;
3946
3947 release_ras_con:
3948         amdgpu_release_ras_context(adev);
3949
3950 failed:
3951         amdgpu_vf_error_trans_all(adev);
3952
3953         return r;
3954 }
3955
3956 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3957 {
3958
3959         /* Clear all CPU mappings pointing to this device */
3960         unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3961
3962         /* Unmap all mapped bars - Doorbell, registers and VRAM */
3963         amdgpu_device_doorbell_fini(adev);
3964
3965         iounmap(adev->rmmio);
3966         adev->rmmio = NULL;
3967         if (adev->mman.aper_base_kaddr)
3968                 iounmap(adev->mman.aper_base_kaddr);
3969         adev->mman.aper_base_kaddr = NULL;
3970
3971         /* Memory manager related */
3972         if (!adev->gmc.xgmi.connected_to_cpu) {
3973                 arch_phys_wc_del(adev->gmc.vram_mtrr);
3974                 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3975         }
3976 }
3977
3978 /**
3979  * amdgpu_device_fini_hw - tear down the driver
3980  *
3981  * @adev: amdgpu_device pointer
3982  *
3983  * Tear down the driver info (all asics).
3984  * Called at driver shutdown.
3985  */
3986 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3987 {
3988         dev_info(adev->dev, "amdgpu: finishing device.\n");
3989         flush_delayed_work(&adev->delayed_init_work);
3990         adev->shutdown = true;
3991
3992         /* make sure IB test finished before entering exclusive mode
3993          * to avoid preemption on IB test
3994          */
3995         if (amdgpu_sriov_vf(adev)) {
3996                 amdgpu_virt_request_full_gpu(adev, false);
3997                 amdgpu_virt_fini_data_exchange(adev);
3998         }
3999
4000         /* disable all interrupts */
4001         amdgpu_irq_disable_all(adev);
4002         if (adev->mode_info.mode_config_initialized) {
4003                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4004                         drm_helper_force_disable_all(adev_to_drm(adev));
4005                 else
4006                         drm_atomic_helper_shutdown(adev_to_drm(adev));
4007         }
4008         amdgpu_fence_driver_hw_fini(adev);
4009
4010         if (adev->mman.initialized) {
4011                 flush_delayed_work(&adev->mman.bdev.wq);
4012                 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
4013         }
4014
4015         if (adev->pm_sysfs_en)
4016                 amdgpu_pm_sysfs_fini(adev);
4017         if (adev->ucode_sysfs_en)
4018                 amdgpu_ucode_sysfs_fini(adev);
4019         if (adev->psp_sysfs_en)
4020                 amdgpu_psp_sysfs_fini(adev);
4021         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4022
4023         /* ras features must be disabled before hw fini */
4024         amdgpu_ras_pre_fini(adev);
4025
4026         amdgpu_device_ip_fini_early(adev);
4027
4028         amdgpu_irq_fini_hw(adev);
4029
4030         if (adev->mman.initialized)
4031                 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4032
4033         amdgpu_gart_dummy_page_fini(adev);
4034
4035         amdgpu_device_unmap_mmio(adev);
4036
4037 }
4038
4039 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4040 {
4041         int idx;
4042
4043         amdgpu_fence_driver_sw_fini(adev);
4044         amdgpu_device_ip_fini(adev);
4045         amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4046         adev->accel_working = false;
4047         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4048
4049         amdgpu_reset_fini(adev);
4050
4051         /* free i2c buses */
4052         if (!amdgpu_device_has_dc_support(adev))
4053                 amdgpu_i2c_fini(adev);
4054
4055         if (amdgpu_emu_mode != 1)
4056                 amdgpu_atombios_fini(adev);
4057
4058         kfree(adev->bios);
4059         adev->bios = NULL;
4060         if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4061                 vga_switcheroo_unregister_client(adev->pdev);
4062                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4063         }
4064         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4065                 vga_client_unregister(adev->pdev);
4066
4067         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4068
4069                 iounmap(adev->rmmio);
4070                 adev->rmmio = NULL;
4071                 amdgpu_device_doorbell_fini(adev);
4072                 drm_dev_exit(idx);
4073         }
4074
4075         if (IS_ENABLED(CONFIG_PERF_EVENTS))
4076                 amdgpu_pmu_fini(adev);
4077         if (adev->mman.discovery_bin)
4078                 amdgpu_discovery_fini(adev);
4079
4080         amdgpu_reset_put_reset_domain(adev->reset_domain);
4081         adev->reset_domain = NULL;
4082
4083         kfree(adev->pci_state);
4084
4085 }
4086
4087 /**
4088  * amdgpu_device_evict_resources - evict device resources
4089  * @adev: amdgpu device object
4090  *
4091  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4092  * of the vram memory type. Mainly used for evicting device resources
4093  * at suspend time.
4094  *
4095  */
4096 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4097 {
4098         int ret;
4099
4100         /* No need to evict vram on APUs for suspend to ram or s2idle */
4101         if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4102                 return 0;
4103
4104         ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4105         if (ret)
4106                 DRM_WARN("evicting device resources failed\n");
4107         return ret;
4108 }
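
/*
 * Illustrative sketch (not compiled): the eviction helper used above is
 * implemented in amdgpu_ttm.c.  At its core it asks TTM to evict everything
 * the given memory type's resource manager currently holds, roughly:
 */
#if 0
static int example_evict_mem_type(struct amdgpu_device *adev, int mem_type)
{
	struct ttm_resource_manager *man =
		ttm_manager_type(&adev->mman.bdev, mem_type);

	if (!man)
		return -EINVAL;

	/* moves every BO of this type off its placement (e.g. VRAM -> GTT) */
	return ttm_resource_manager_evict_all(&adev->mman.bdev, man);
}
#endif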
4109
4110 /*
4111  * Suspend & resume.
4112  */
4113 /**
4114  * amdgpu_device_suspend - initiate device suspend
4115  *
4116  * @dev: drm dev pointer
4117  * @fbcon: notify the fbdev of suspend
4118  *
4119  * Puts the hw in the suspend state (all asics).
4120  * Returns 0 for success or an error on failure.
4121  * Called at driver suspend.
4122  */
4123 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4124 {
4125         struct amdgpu_device *adev = drm_to_adev(dev);
4126         int r = 0;
4127
4128         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4129                 return 0;
4130
4131         adev->in_suspend = true;
4132
4133         /* Evict the majority of BOs before grabbing the full access */
4134         r = amdgpu_device_evict_resources(adev);
4135         if (r)
4136                 return r;
4137
4138         if (amdgpu_sriov_vf(adev)) {
4139                 amdgpu_virt_fini_data_exchange(adev);
4140                 r = amdgpu_virt_request_full_gpu(adev, false);
4141                 if (r)
4142                         return r;
4143         }
4144
4145         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4146                 DRM_WARN("smart shift update failed\n");
4147
4148         drm_kms_helper_poll_disable(dev);
4149
4150         if (fbcon)
4151                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4152
4153         cancel_delayed_work_sync(&adev->delayed_init_work);
4154
4155         amdgpu_ras_suspend(adev);
4156
4157         amdgpu_device_ip_suspend_phase1(adev);
4158
4159         if (!adev->in_s0ix)
4160                 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4161
4162         r = amdgpu_device_evict_resources(adev);
4163         if (r)
4164                 return r;
4165
4166         amdgpu_fence_driver_hw_fini(adev);
4167
4168         amdgpu_device_ip_suspend_phase2(adev);
4169
4170         if (amdgpu_sriov_vf(adev))
4171                 amdgpu_virt_release_full_gpu(adev, false);
4172
4173         return 0;
4174 }
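
/*
 * Illustrative sketch (not compiled): amdgpu_device_suspend()/_resume() are
 * not called directly by the PM core; they are wrapped by the PM callbacks
 * in amdgpu_drv.c (amdgpu_pmops_*), roughly along these lines.  The extra
 * state handling done by the real callbacks (in_s0ix/in_s3 detection,
 * runtime PM paths, etc.) is omitted here.
 */
#if 0
static int example_pmops_suspend(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);

	/* fbcon=true: the fbdev emulation is suspended as well */
	return amdgpu_device_suspend(drm_dev, true);
}

static int example_pmops_resume(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);

	return amdgpu_device_resume(drm_dev, true);
}
#endif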
4175
4176 /**
4177  * amdgpu_device_resume - initiate device resume
4178  *
4179  * @dev: drm dev pointer
4180  * @fbcon: notify the fbdev of resume
4181  *
4182  * Bring the hw back to operating state (all asics).
4183  * Returns 0 for success or an error on failure.
4184  * Called at driver resume.
4185  */
4186 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4187 {
4188         struct amdgpu_device *adev = drm_to_adev(dev);
4189         int r = 0;
4190
4191         if (amdgpu_sriov_vf(adev)) {
4192                 r = amdgpu_virt_request_full_gpu(adev, true);
4193                 if (r)
4194                         return r;
4195         }
4196
4197         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4198                 return 0;
4199
4200         if (adev->in_s0ix)
4201                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4202
4203         /* post card */
4204         if (amdgpu_device_need_post(adev)) {
4205                 r = amdgpu_device_asic_init(adev);
4206                 if (r)
4207                         dev_err(adev->dev, "amdgpu asic init failed\n");
4208         }
4209
4210         r = amdgpu_device_ip_resume(adev);
4211
4212         if (r) {
4213                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4214                 goto exit;
4215         }
4216         amdgpu_fence_driver_hw_init(adev);
4217
4218         r = amdgpu_device_ip_late_init(adev);
4219         if (r)
4220                 goto exit;
4221
4222         queue_delayed_work(system_wq, &adev->delayed_init_work,
4223                            msecs_to_jiffies(AMDGPU_RESUME_MS));
4224
4225         if (!adev->in_s0ix) {
4226                 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4227                 if (r)
4228                         goto exit;
4229         }
4230
4231 exit:
4232         if (amdgpu_sriov_vf(adev)) {
4233                 amdgpu_virt_init_data_exchange(adev);
4234                 amdgpu_virt_release_full_gpu(adev, true);
4235         }
4236
4237         if (r)
4238                 return r;
4239
4240         /* Make sure IB tests flushed */
4241         flush_delayed_work(&adev->delayed_init_work);
4242
4243         if (fbcon)
4244                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4245
4246         drm_kms_helper_poll_enable(dev);
4247
4248         amdgpu_ras_resume(adev);
4249
4250         if (adev->mode_info.num_crtc) {
4251                 /*
4252                  * Most of the connector probing functions try to acquire runtime pm
4253                  * refs to ensure that the GPU is powered on when connector polling is
4254                  * performed. Since we're calling this from a runtime PM callback,
4255                  * trying to acquire rpm refs will cause us to deadlock.
4256                  *
4257                  * Since we're guaranteed to be holding the rpm lock, it's safe to
4258                  * temporarily disable the rpm helpers so this doesn't deadlock us.
4259                  */
4260 #ifdef CONFIG_PM
4261                 dev->dev->power.disable_depth++;
4262 #endif
4263                 if (!adev->dc_enabled)
4264                         drm_helper_hpd_irq_event(dev);
4265                 else
4266                         drm_kms_helper_hotplug_event(dev);
4267 #ifdef CONFIG_PM
4268                 dev->dev->power.disable_depth--;
4269 #endif
4270         }
4271         adev->in_suspend = false;
4272
4273         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4274                 DRM_WARN("smart shift update failed\n");
4275
4276         return 0;
4277 }
4278
4279 /**
4280  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4281  *
4282  * @adev: amdgpu_device pointer
4283  *
4284  * The list of all the hardware IPs that make up the asic is walked and
4285  * the check_soft_reset callbacks are run.  check_soft_reset determines
4286  * if the asic is still hung or not.
4287  * Returns true if any of the IPs are still in a hung state, false if not.
4288  */
4289 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4290 {
4291         int i;
4292         bool asic_hang = false;
4293
4294         if (amdgpu_sriov_vf(adev))
4295                 return true;
4296
4297         if (amdgpu_asic_need_full_reset(adev))
4298                 return true;
4299
4300         for (i = 0; i < adev->num_ip_blocks; i++) {
4301                 if (!adev->ip_blocks[i].status.valid)
4302                         continue;
4303                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4304                         adev->ip_blocks[i].status.hang =
4305                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4306                 if (adev->ip_blocks[i].status.hang) {
4307                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4308                         asic_hang = true;
4309                 }
4310         }
4311         return asic_hang;
4312 }
4313
4314 /**
4315  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4316  *
4317  * @adev: amdgpu_device pointer
4318  *
4319  * The list of all the hardware IPs that make up the asic is walked and the
4320  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4321  * handles any IP specific hardware or software state changes that are
4322  * necessary for a soft reset to succeed.
4323  * Returns 0 on success, negative error code on failure.
4324  */
4325 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4326 {
4327         int i, r = 0;
4328
4329         for (i = 0; i < adev->num_ip_blocks; i++) {
4330                 if (!adev->ip_blocks[i].status.valid)
4331                         continue;
4332                 if (adev->ip_blocks[i].status.hang &&
4333                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4334                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4335                         if (r)
4336                                 return r;
4337                 }
4338         }
4339
4340         return 0;
4341 }
4342
4343 /**
4344  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4345  *
4346  * @adev: amdgpu_device pointer
4347  *
4348  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4349  * reset is necessary to recover.
4350  * Returns true if a full asic reset is required, false if not.
4351  */
4352 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4353 {
4354         int i;
4355
4356         if (amdgpu_asic_need_full_reset(adev))
4357                 return true;
4358
4359         for (i = 0; i < adev->num_ip_blocks; i++) {
4360                 if (!adev->ip_blocks[i].status.valid)
4361                         continue;
4362                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4363                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4364                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4365                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4366                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4367                         if (adev->ip_blocks[i].status.hang) {
4368                                 dev_info(adev->dev, "Some block needs full reset!\n");
4369                                 return true;
4370                         }
4371                 }
4372         }
4373         return false;
4374 }
4375
4376 /**
4377  * amdgpu_device_ip_soft_reset - do a soft reset
4378  *
4379  * @adev: amdgpu_device pointer
4380  *
4381  * The list of all the hardware IPs that make up the asic is walked and the
4382  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4383  * IP specific hardware or software state changes that are necessary to soft
4384  * reset the IP.
4385  * Returns 0 on success, negative error code on failure.
4386  */
4387 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4388 {
4389         int i, r = 0;
4390
4391         for (i = 0; i < adev->num_ip_blocks; i++) {
4392                 if (!adev->ip_blocks[i].status.valid)
4393                         continue;
4394                 if (adev->ip_blocks[i].status.hang &&
4395                     adev->ip_blocks[i].version->funcs->soft_reset) {
4396                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4397                         if (r)
4398                                 return r;
4399                 }
4400         }
4401
4402         return 0;
4403 }
4404
4405 /**
4406  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4407  *
4408  * @adev: amdgpu_device pointer
4409  *
4410  * The list of all the hardware IPs that make up the asic is walked and the
4411  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4412  * handles any IP specific hardware or software state changes that are
4413  * necessary after the IP has been soft reset.
4414  * Returns 0 on success, negative error code on failure.
4415  */
4416 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4417 {
4418         int i, r = 0;
4419
4420         for (i = 0; i < adev->num_ip_blocks; i++) {
4421                 if (!adev->ip_blocks[i].status.valid)
4422                         continue;
4423                 if (adev->ip_blocks[i].status.hang &&
4424                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4425                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4426                 if (r)
4427                         return r;
4428         }
4429
4430         return 0;
4431 }
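/*
 * Illustrative sketch (not part of the driver): how a hypothetical IP
 * block could wire up the four soft-reset hooks walked by the helpers
 * above. Every foo_* name is invented for illustration only.
 */
#if 0
static bool foo_check_soft_reset(void *handle)
{
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

        /* Read the block's status registers and report whether it hangs */
        return foo_engine_is_hung(adev);        /* hypothetical helper */
}

static const struct amd_ip_funcs foo_ip_funcs = {
        .name             = "foo",
        .check_soft_reset = foo_check_soft_reset,       /* sets status.hang */
        .pre_soft_reset   = foo_pre_soft_reset,         /* quiesce the block */
        .soft_reset       = foo_soft_reset,             /* toggle the reset bits */
        .post_soft_reset  = foo_post_soft_reset,        /* restore block state */
};
#endif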
4432
4433 /**
4434  * amdgpu_device_recover_vram - Recover some VRAM contents
4435  *
4436  * @adev: amdgpu_device pointer
4437  *
4438  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4439  * restore things like GPUVM page tables after a GPU reset where
4440  * the contents of VRAM might be lost.
4441  *
4442  * Returns:
4443  * 0 on success, negative error code on failure.
4444  */
4445 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4446 {
4447         struct dma_fence *fence = NULL, *next = NULL;
4448         struct amdgpu_bo *shadow;
4449         struct amdgpu_bo_vm *vmbo;
4450         long r = 1, tmo;
4451
4452         if (amdgpu_sriov_runtime(adev))
4453                 tmo = msecs_to_jiffies(8000);
4454         else
4455                 tmo = msecs_to_jiffies(100);
4456
4457         dev_info(adev->dev, "recover vram bo from shadow start\n");
4458         mutex_lock(&adev->shadow_list_lock);
4459         list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4460                 shadow = &vmbo->bo;
4461                 /* No need to recover an evicted BO */
4462                 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4463                     shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4464                     shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4465                         continue;
4466
4467                 r = amdgpu_bo_restore_shadow(shadow, &next);
4468                 if (r)
4469                         break;
4470
4471                 if (fence) {
4472                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4473                         dma_fence_put(fence);
4474                         fence = next;
4475                         if (tmo == 0) {
4476                                 r = -ETIMEDOUT;
4477                                 break;
4478                         } else if (tmo < 0) {
4479                                 r = tmo;
4480                                 break;
4481                         }
4482                 } else {
4483                         fence = next;
4484                 }
4485         }
4486         mutex_unlock(&adev->shadow_list_lock);
4487
4488         if (fence)
4489                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4490         dma_fence_put(fence);
4491
4492         if (r < 0 || tmo <= 0) {
4493                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4494                 return -EIO;
4495         }
4496
4497         dev_info(adev->dev, "recover vram bo from shadow done\n");
4498         return 0;
4499 }
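/*
 * Illustrative note (not part of the driver): dma_fence_wait_timeout()
 * returns the remaining timeout, so feeding its return value back in, as
 * done above, keeps a single time budget across all shadow restores.
 */
#if 0
        long budget = msecs_to_jiffies(100);

        budget = dma_fence_wait_timeout(fence_a, false, budget);  /* remaining jiffies */
        budget = dma_fence_wait_timeout(fence_b, false, budget);  /* budget keeps shrinking */
        /* budget == 0 means the wait timed out, budget < 0 is an error code */
#endif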
4500
4501
4502 /**
4503  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4504  *
4505  * @adev: amdgpu_device pointer
4506  * @from_hypervisor: request from hypervisor
4507  *
4508  * Do a VF FLR and reinitialize the ASIC.
4509  * Returns 0 on success, negative error code on failure.
4510  */
4511 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4512                                      bool from_hypervisor)
4513 {
4514         int r;
4515         struct amdgpu_hive_info *hive = NULL;
4516         int retry_limit = 0;
4517
4518 retry:
4519         amdgpu_amdkfd_pre_reset(adev);
4520
4521         if (from_hypervisor)
4522                 r = amdgpu_virt_request_full_gpu(adev, true);
4523         else
4524                 r = amdgpu_virt_reset_gpu(adev);
4525         if (r)
4526                 return r;
4527
4528         /* Resume IP prior to SMC */
4529         r = amdgpu_device_ip_reinit_early_sriov(adev);
4530         if (r)
4531                 goto error;
4532
4533         amdgpu_virt_init_data_exchange(adev);
4534
4535         r = amdgpu_device_fw_loading(adev);
4536         if (r)
4537                 return r;
4538
4539         /* now we are okay to resume SMC/CP/SDMA */
4540         r = amdgpu_device_ip_reinit_late_sriov(adev);
4541         if (r)
4542                 goto error;
4543
4544         hive = amdgpu_get_xgmi_hive(adev);
4545         /* Update PSP FW topology after reset */
4546         if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4547                 r = amdgpu_xgmi_update_topology(hive, adev);
4548
4549         if (hive)
4550                 amdgpu_put_xgmi_hive(hive);
4551
4552         if (!r) {
4553                 amdgpu_irq_gpu_reset_resume_helper(adev);
4554                 r = amdgpu_ib_ring_tests(adev);
4555
4556                 amdgpu_amdkfd_post_reset(adev);
4557         }
4558
4559 error:
4560         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4561                 amdgpu_inc_vram_lost(adev);
4562                 r = amdgpu_device_recover_vram(adev);
4563         }
4564         amdgpu_virt_release_full_gpu(adev, true);
4565
4566         if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4567                 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4568                         retry_limit++;
4569                         goto retry;
4570                 } else
4571                         DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4572         }
4573
4574         return r;
4575 }
4576
4577 /**
4578  * amdgpu_device_has_job_running - check if there is any job in the pending list
4579  *
4580  * @adev: amdgpu_device pointer
4581  *
4582  * Check if there is any job still pending on any of the device's rings.
4583  */
4584 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4585 {
4586         int i;
4587         struct drm_sched_job *job;
4588
4589         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4590                 struct amdgpu_ring *ring = adev->rings[i];
4591
4592                 if (!ring || !ring->sched.thread)
4593                         continue;
4594
4595                 spin_lock(&ring->sched.job_list_lock);
4596                 job = list_first_entry_or_null(&ring->sched.pending_list,
4597                                                struct drm_sched_job, list);
4598                 spin_unlock(&ring->sched.job_list_lock);
4599                 if (job)
4600                         return true;
4601         }
4602         return false;
4603 }
4604
4605 /**
4606  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4607  *
4608  * @adev: amdgpu_device pointer
4609  *
4610  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4611  * a hung GPU.
4612  */
4613 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4614 {
4615
4616         if (amdgpu_gpu_recovery == 0)
4617                 goto disabled;
4618
4619         /* Skip soft reset check in fatal error mode */
4620         if (!amdgpu_ras_is_poison_mode_supported(adev))
4621                 return true;
4622
4623         if (amdgpu_sriov_vf(adev))
4624                 return true;
4625
4626         if (amdgpu_gpu_recovery == -1) {
4627                 switch (adev->asic_type) {
4628 #ifdef CONFIG_DRM_AMDGPU_SI
4629                 case CHIP_VERDE:
4630                 case CHIP_TAHITI:
4631                 case CHIP_PITCAIRN:
4632                 case CHIP_OLAND:
4633                 case CHIP_HAINAN:
4634 #endif
4635 #ifdef CONFIG_DRM_AMDGPU_CIK
4636                 case CHIP_KAVERI:
4637                 case CHIP_KABINI:
4638                 case CHIP_MULLINS:
4639 #endif
4640                 case CHIP_CARRIZO:
4641                 case CHIP_STONEY:
4642                 case CHIP_CYAN_SKILLFISH:
4643                         goto disabled;
4644                 default:
4645                         break;
4646                 }
4647         }
4648
4649         return true;
4650
4651 disabled:
4652         dev_info(adev->dev, "GPU recovery disabled.\n");
4653         return false;
4654 }
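/*
 * Illustrative note: the checks above implement the amdgpu.gpu_recovery
 * module parameter. Assumed typical usage (the values follow from the
 * logic above, the exact command lines are examples only):
 *
 *   modprobe amdgpu gpu_recovery=1    # always attempt GPU recovery
 *   modprobe amdgpu gpu_recovery=0    # never attempt GPU recovery
 *   modprobe amdgpu gpu_recovery=-1   # auto: disabled on the ASICs listed above
 */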
4655
4656 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4657 {
4658         u32 i;
4659         int ret = 0;
4660
4661         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4662
4663         dev_info(adev->dev, "GPU mode1 reset\n");
4664
4665         /* disable BM */
4666         pci_clear_master(adev->pdev);
4667
4668         amdgpu_device_cache_pci_state(adev->pdev);
4669
4670         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4671                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4672                 ret = amdgpu_dpm_mode1_reset(adev);
4673         } else {
4674                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4675                 ret = psp_gpu_reset(adev);
4676         }
4677
4678         if (ret)
4679                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4680
4681         amdgpu_device_load_pci_state(adev->pdev);
4682
4683         /* wait for asic to come out of reset */
4684         for (i = 0; i < adev->usec_timeout; i++) {
4685                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4686
4687                 if (memsize != 0xffffffff)
4688                         break;
4689                 udelay(1);
4690         }
4691
4692         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4693         return ret;
4694 }
4695
4696 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4697                                  struct amdgpu_reset_context *reset_context)
4698 {
4699         int i, r = 0;
4700         struct amdgpu_job *job = NULL;
4701         bool need_full_reset =
4702                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4703
4704         if (reset_context->reset_req_dev == adev)
4705                 job = reset_context->job;
4706
4707         if (amdgpu_sriov_vf(adev)) {
4708                 /* stop the data exchange thread */
4709                 amdgpu_virt_fini_data_exchange(adev);
4710         }
4711
4712         amdgpu_fence_driver_isr_toggle(adev, true);
4713
4714         /* block all schedulers and reset given job's ring */
4715         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4716                 struct amdgpu_ring *ring = adev->rings[i];
4717
4718                 if (!ring || !ring->sched.thread)
4719                         continue;
4720
4721                 /* Clear job fences from the fence driver to avoid force_completion
4722                  * leaving NULL and vm flush fences in the fence driver. */
4723                 amdgpu_fence_driver_clear_job_fences(ring);
4724
4725                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4726                 amdgpu_fence_driver_force_completion(ring);
4727         }
4728
4729         amdgpu_fence_driver_isr_toggle(adev, false);
4730
4731         if (job && job->vm)
4732                 drm_sched_increase_karma(&job->base);
4733
4734         r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4735         /* If reset handler not implemented, continue; otherwise return */
4736         if (r == -ENOSYS)
4737                 r = 0;
4738         else
4739                 return r;
4740
4741         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4742         if (!amdgpu_sriov_vf(adev)) {
4743
4744                 if (!need_full_reset)
4745                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4746
4747                 if (!need_full_reset && amdgpu_gpu_recovery &&
4748                     amdgpu_device_ip_check_soft_reset(adev)) {
4749                         amdgpu_device_ip_pre_soft_reset(adev);
4750                         r = amdgpu_device_ip_soft_reset(adev);
4751                         amdgpu_device_ip_post_soft_reset(adev);
4752                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4753                                 dev_info(adev->dev, "soft reset failed, will fall back to full reset!\n");
4754                                 need_full_reset = true;
4755                         }
4756                 }
4757
4758                 if (need_full_reset)
4759                         r = amdgpu_device_ip_suspend(adev);
4760                 if (need_full_reset)
4761                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4762                 else
4763                         clear_bit(AMDGPU_NEED_FULL_RESET,
4764                                   &reset_context->flags);
4765         }
4766
4767         return r;
4768 }
4769
4770 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4771 {
4772         int i;
4773
4774         lockdep_assert_held(&adev->reset_domain->sem);
4775
4776         for (i = 0; i < adev->num_regs; i++) {
4777                 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4778                 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4779                                              adev->reset_dump_reg_value[i]);
4780         }
4781
4782         return 0;
4783 }
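/*
 * Illustrative note (assumed standard ftrace usage): the register values
 * captured above are also emitted through the amdgpu_reset_reg_dumps
 * trace event, which can be observed with something like:
 *
 *   echo 1 > /sys/kernel/debug/tracing/events/amdgpu/amdgpu_reset_reg_dumps/enable
 *   cat /sys/kernel/debug/tracing/trace_pipe
 */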
4784
4785 #ifdef CONFIG_DEV_COREDUMP
4786 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4787                 size_t count, void *data, size_t datalen)
4788 {
4789         struct drm_printer p;
4790         struct amdgpu_device *adev = data;
4791         struct drm_print_iterator iter;
4792         int i;
4793
4794         iter.data = buffer;
4795         iter.offset = 0;
4796         iter.start = offset;
4797         iter.remain = count;
4798
4799         p = drm_coredump_printer(&iter);
4800
4801         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4802         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4803         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4804         drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4805         if (adev->reset_task_info.pid)
4806                 drm_printf(&p, "process_name: %s PID: %d\n",
4807                            adev->reset_task_info.process_name,
4808                            adev->reset_task_info.pid);
4809
4810         if (adev->reset_vram_lost)
4811                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4812         if (adev->num_regs) {
4813                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4814
4815                 for (i = 0; i < adev->num_regs; i++)
4816                         drm_printf(&p, "0x%08x: 0x%08x\n",
4817                                    adev->reset_dump_reg_list[i],
4818                                    adev->reset_dump_reg_value[i]);
4819         }
4820
4821         return count - iter.remain;
4822 }
4823
4824 static void amdgpu_devcoredump_free(void *data)
4825 {
4826 }
4827
4828 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4829 {
4830         struct drm_device *dev = adev_to_drm(adev);
4831
4832         ktime_get_ts64(&adev->reset_time);
4833         dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4834                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4835 }
4836 #endif
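/*
 * Illustrative example only (values made up): given the format strings
 * above, a captured devcoredump read back through the devcoredump class
 * device (e.g. /sys/class/devcoredump/devcd<N>/data) would look roughly
 * like:
 *
 *   **** AMDGPU Device Coredump ****
 *   kernel: 6.1.0
 *   module: amdgpu
 *   time: 1670000000.123456789
 *   process_name: foo_app PID: 1234
 *   VRAM is lost due to GPU reset!
 *   AMDGPU register dumps:
 *   Offset:     Value:
 *   0x0000abcd: 0x00000001
 *
 * Only the lines enabled by the checks above appear in a real dump.
 */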
4837
4838 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4839                          struct amdgpu_reset_context *reset_context)
4840 {
4841         struct amdgpu_device *tmp_adev = NULL;
4842         bool need_full_reset, skip_hw_reset, vram_lost = false;
4843         int r = 0;
4844         bool gpu_reset_for_dev_remove = false;
4845
4846         /* Try reset handler method first */
4847         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4848                                     reset_list);
4849         amdgpu_reset_reg_dumps(tmp_adev);
4850
4851         reset_context->reset_device_list = device_list_handle;
4852         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4853         /* If reset handler not implemented, continue; otherwise return */
4854         if (r == -ENOSYS)
4855                 r = 0;
4856         else
4857                 return r;
4858
4859         /* Reset handler not implemented, use the default method */
4860         need_full_reset =
4861                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4862         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4863
4864         gpu_reset_for_dev_remove =
4865                 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4866                         test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4867
4868         /*
4869          * ASIC reset has to be done on all XGMI hive nodes ASAP
4870          * to allow proper links negotiation in FW (within 1 sec)
4871          */
4872         if (!skip_hw_reset && need_full_reset) {
4873                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4874                         /* For XGMI run all resets in parallel to speed up the process */
4875                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4876                                 tmp_adev->gmc.xgmi.pending_reset = false;
4877                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4878                                         r = -EALREADY;
4879                         } else
4880                                 r = amdgpu_asic_reset(tmp_adev);
4881
4882                         if (r) {
4883                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4884                                          r, adev_to_drm(tmp_adev)->unique);
4885                                 break;
4886                         }
4887                 }
4888
4889                 /* For XGMI wait for all resets to complete before proceeding */
4890                 if (!r) {
4891                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4892                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4893                                         flush_work(&tmp_adev->xgmi_reset_work);
4894                                         r = tmp_adev->asic_reset_res;
4895                                         if (r)
4896                                                 break;
4897                                 }
4898                         }
4899                 }
4900         }
4901
4902         if (!r && amdgpu_ras_intr_triggered()) {
4903                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4904                         if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4905                             tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4906                                 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4907                 }
4908
4909                 amdgpu_ras_intr_cleared();
4910         }
4911
4912         /* Since the mode1 reset affects base ip blocks, the
4913          * phase1 ip blocks need to be resumed. Otherwise there
4914          * will be a BIOS signature error and the psp bootloader
4915          * can't load kdb on the next amdgpu install.
4916          */
4917         if (gpu_reset_for_dev_remove) {
4918                 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4919                         amdgpu_device_ip_resume_phase1(tmp_adev);
4920
4921                 goto end;
4922         }
4923
4924         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4925                 if (need_full_reset) {
4926                         /* post card */
4927                         r = amdgpu_device_asic_init(tmp_adev);
4928                         if (r) {
4929                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4930                         } else {
4931                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4932                                 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4933                                 if (r)
4934                                         goto out;
4935
4936                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4937                                 if (r)
4938                                         goto out;
4939
4940                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4941 #ifdef CONFIG_DEV_COREDUMP
4942                                 tmp_adev->reset_vram_lost = vram_lost;
4943                                 memset(&tmp_adev->reset_task_info, 0,
4944                                                 sizeof(tmp_adev->reset_task_info));
4945                                 if (reset_context->job && reset_context->job->vm)
4946                                         tmp_adev->reset_task_info =
4947                                                 reset_context->job->vm->task_info;
4948                                 amdgpu_reset_capture_coredumpm(tmp_adev);
4949 #endif
4950                                 if (vram_lost) {
4951                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4952                                         amdgpu_inc_vram_lost(tmp_adev);
4953                                 }
4954
4955                                 r = amdgpu_device_fw_loading(tmp_adev);
4956                                 if (r)
4957                                         return r;
4958
4959                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4960                                 if (r)
4961                                         goto out;
4962
4963                                 if (vram_lost)
4964                                         amdgpu_device_fill_reset_magic(tmp_adev);
4965
4966                                 /*
4967                                  * Add this ASIC back as tracked, since the reset
4968                                  * has already completed successfully.
4969                                  */
4970                                 amdgpu_register_gpu_instance(tmp_adev);
4971
4972                                 if (!reset_context->hive &&
4973                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4974                                         amdgpu_xgmi_add_device(tmp_adev);
4975
4976                                 r = amdgpu_device_ip_late_init(tmp_adev);
4977                                 if (r)
4978                                         goto out;
4979
4980                                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4981
4982                                 /*
4983                                  * The GPU enters a bad state once the number of
4984                                  * faulty pages reported by ECC reaches the
4985                                  * threshold, and RAS recovery is scheduled next.
4986                                  * Add a check here to abort recovery if the bad
4987                                  * page threshold has indeed been exceeded, and
4988                                  * remind the user to retire this GPU or to set a
4989                                  * bigger bad_page_threshold value the next time
4990                                  * the driver is probed.
4991                                  */
4992                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4993                                         /* must succeed. */
4994                                         amdgpu_ras_resume(tmp_adev);
4995                                 } else {
4996                                         r = -EINVAL;
4997                                         goto out;
4998                                 }
4999
5000                                 /* Update PSP FW topology after reset */
5001                                 if (reset_context->hive &&
5002                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5003                                         r = amdgpu_xgmi_update_topology(
5004                                                 reset_context->hive, tmp_adev);
5005                         }
5006                 }
5007
5008 out:
5009                 if (!r) {
5010                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5011                         r = amdgpu_ib_ring_tests(tmp_adev);
5012                         if (r) {
5013                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5014                                 need_full_reset = true;
5015                                 r = -EAGAIN;
5016                                 goto end;
5017                         }
5018                 }
5019
5020                 if (!r)
5021                         r = amdgpu_device_recover_vram(tmp_adev);
5022                 else
5023                         tmp_adev->asic_reset_res = r;
5024         }
5025
5026 end:
5027         if (need_full_reset)
5028                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5029         else
5030                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5031         return r;
5032 }
5033
5034 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5035 {
5036
5037         switch (amdgpu_asic_reset_method(adev)) {
5038         case AMD_RESET_METHOD_MODE1:
5039                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5040                 break;
5041         case AMD_RESET_METHOD_MODE2:
5042                 adev->mp1_state = PP_MP1_STATE_RESET;
5043                 break;
5044         default:
5045                 adev->mp1_state = PP_MP1_STATE_NONE;
5046                 break;
5047         }
5048 }
5049
5050 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5051 {
5052         amdgpu_vf_error_trans_all(adev);
5053         adev->mp1_state = PP_MP1_STATE_NONE;
5054 }
5055
5056 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5057 {
5058         struct pci_dev *p = NULL;
5059
5060         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5061                         adev->pdev->bus->number, 1);
5062         if (p) {
5063                 pm_runtime_enable(&(p->dev));
5064                 pm_runtime_resume(&(p->dev));
5065         }
5066
5067         pci_dev_put(p);
5068 }
5069
5070 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5071 {
5072         enum amd_reset_method reset_method;
5073         struct pci_dev *p = NULL;
5074         u64 expires;
5075
5076         /*
5077          * For now, only BACO and mode1 reset are confirmed to suffer
5078          * the audio issue if the audio device is not properly suspended.
5079          */
5080         reset_method = amdgpu_asic_reset_method(adev);
5081         if ((reset_method != AMD_RESET_METHOD_BACO) &&
5082              (reset_method != AMD_RESET_METHOD_MODE1))
5083                 return -EINVAL;
5084
5085         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5086                         adev->pdev->bus->number, 1);
5087         if (!p)
5088                 return -ENODEV;
5089
5090         expires = pm_runtime_autosuspend_expiration(&(p->dev));
5091         if (!expires)
5092                 /*
5093                  * If we cannot get the audio device autosuspend delay,
5094                  * a fixed 4s interval will be used. Since 3s is the
5095                  * audio controller's default autosuspend delay setting,
5096                  * the 4s used here is guaranteed to cover it.
5097                  */
5098                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5099
5100         while (!pm_runtime_status_suspended(&(p->dev))) {
5101                 if (!pm_runtime_suspend(&(p->dev)))
5102                         break;
5103
5104                 if (expires < ktime_get_mono_fast_ns()) {
5105                         dev_warn(adev->dev, "failed to suspend display audio\n");
5106                         pci_dev_put(p);
5107                         /* TODO: abort the succeeding gpu reset? */
5108                         return -ETIMEDOUT;
5109                 }
5110         }
5111
5112         pm_runtime_disable(&(p->dev));
5113
5114         pci_dev_put(p);
5115         return 0;
5116 }
5117
5118 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5119 {
5120         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5121
5122 #if defined(CONFIG_DEBUG_FS)
5123         if (!amdgpu_sriov_vf(adev))
5124                 cancel_work(&adev->reset_work);
5125 #endif
5126
5127         if (adev->kfd.dev)
5128                 cancel_work(&adev->kfd.reset_work);
5129
5130         if (amdgpu_sriov_vf(adev))
5131                 cancel_work(&adev->virt.flr_work);
5132
5133         if (con && adev->ras_enabled)
5134                 cancel_work(&con->recovery_work);
5135
5136 }
5137
5138 /**
5139  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5140  *
5141  * @adev: amdgpu_device pointer
5142  * @job: which job triggered the hang
5143  *
5144  * Attempt to reset the GPU if it has hung (all asics).
5145  * Attempts a soft reset or a full reset and reinitializes the ASIC.
5146  * Returns 0 for success or an error on failure.
5147  */
5148
5149 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5150                               struct amdgpu_job *job,
5151                               struct amdgpu_reset_context *reset_context)
5152 {
5153         struct list_head device_list, *device_list_handle =  NULL;
5154         bool job_signaled = false;
5155         struct amdgpu_hive_info *hive = NULL;
5156         struct amdgpu_device *tmp_adev = NULL;
5157         int i, r = 0;
5158         bool need_emergency_restart = false;
5159         bool audio_suspended = false;
5160         bool gpu_reset_for_dev_remove = false;
5161
5162         gpu_reset_for_dev_remove =
5163                         test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5164                                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5165
5166         /*
5167          * Special case: RAS triggered and full reset isn't supported
5168          */
5169         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5170
5171         /*
5172          * Flush RAM to disk so that after reboot
5173          * the user can read the log and see why the system rebooted.
5174          */
5175         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5176                 DRM_WARN("Emergency reboot.");
5177
5178                 ksys_sync_helper();
5179                 emergency_restart();
5180         }
5181
5182         dev_info(adev->dev, "GPU %s begin!\n",
5183                 need_emergency_restart ? "jobs stop":"reset");
5184
5185         if (!amdgpu_sriov_vf(adev))
5186                 hive = amdgpu_get_xgmi_hive(adev);
5187         if (hive)
5188                 mutex_lock(&hive->hive_lock);
5189
5190         reset_context->job = job;
5191         reset_context->hive = hive;
5192         /*
5193          * Build list of devices to reset.
5194          * In case we are in XGMI hive mode, re-sort the device list
5195          * to put adev in the 1st position.
5196          */
5197         INIT_LIST_HEAD(&device_list);
5198         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5199                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5200                         list_add_tail(&tmp_adev->reset_list, &device_list);
5201                         if (gpu_reset_for_dev_remove && adev->shutdown)
5202                                 tmp_adev->shutdown = true;
5203                 }
5204                 if (!list_is_first(&adev->reset_list, &device_list))
5205                         list_rotate_to_front(&adev->reset_list, &device_list);
5206                 device_list_handle = &device_list;
5207         } else {
5208                 list_add_tail(&adev->reset_list, &device_list);
5209                 device_list_handle = &device_list;
5210         }
5211
5212         /* We need to lock reset domain only once both for XGMI and single device */
5213         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5214                                     reset_list);
5215         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5216
5217         /* block all schedulers and reset given job's ring */
5218         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5219
5220                 amdgpu_device_set_mp1_state(tmp_adev);
5221
5222                 /*
5223                  * Try to put the audio codec into suspend state
5224                  * before the gpu reset starts.
5225                  *
5226                  * Because the power domain of the graphics device
5227                  * is shared with the AZ power domain, without this
5228                  * we may change the audio hardware behind the
5229                  * audio driver's back, which will trigger
5230                  * audio codec errors.
5231                  */
5232                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5233                         audio_suspended = true;
5234
5235                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5236
5237                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5238
5239                 if (!amdgpu_sriov_vf(tmp_adev))
5240                         amdgpu_amdkfd_pre_reset(tmp_adev);
5241
5242                 /*
5243                  * Mark these ASICs to be reset as untracked first,
5244                  * and add them back after the reset completes.
5245                  */
5246                 amdgpu_unregister_gpu_instance(tmp_adev);
5247
5248                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5249
5250                 /* disable ras on ALL IPs */
5251                 if (!need_emergency_restart &&
5252                       amdgpu_device_ip_need_full_reset(tmp_adev))
5253                         amdgpu_ras_suspend(tmp_adev);
5254
5255                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5256                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5257
5258                         if (!ring || !ring->sched.thread)
5259                                 continue;
5260
5261                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5262
5263                         if (need_emergency_restart)
5264                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5265                 }
5266                 atomic_inc(&tmp_adev->gpu_reset_counter);
5267         }
5268
5269         if (need_emergency_restart)
5270                 goto skip_sched_resume;
5271
5272         /*
5273          * Must check guilty signal here since after this point all old
5274          * HW fences are force signaled.
5275          *
5276          * job->base holds a reference to parent fence
5277          */
5278         if (job && dma_fence_is_signaled(&job->hw_fence)) {
5279                 job_signaled = true;
5280                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5281                 goto skip_hw_reset;
5282         }
5283
5284 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
5285         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5286                 if (gpu_reset_for_dev_remove) {
5287                         /* Workaround for ASICs that need to disable SMC first */
5288                         amdgpu_device_smu_fini_early(tmp_adev);
5289                 }
5290                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5291                 /* TODO: Should we stop? */
5292                 if (r) {
5293                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5294                                   r, adev_to_drm(tmp_adev)->unique);
5295                         tmp_adev->asic_reset_res = r;
5296                 }
5297
5298                 /*
5299                  * Drop all pending non scheduler resets. Scheduler resets
5300                  * were already dropped during drm_sched_stop
5301                  */
5302                 amdgpu_device_stop_pending_resets(tmp_adev);
5303         }
5304
5305         /* Actual ASIC resets if needed. */
5306         /* Host driver will handle XGMI hive reset for SRIOV */
5307         if (amdgpu_sriov_vf(adev)) {
5308                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5309                 if (r)
5310                         adev->asic_reset_res = r;
5311
5312                 /* Aldebaran supports RAS in SRIOV, so we need to resume RAS during reset */
5313                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5314                         amdgpu_ras_resume(adev);
5315         } else {
5316                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5317                 if (r && r == -EAGAIN)
5318                         goto retry;
5319
5320                 if (!r && gpu_reset_for_dev_remove)
5321                         goto recover_end;
5322         }
5323
5324 skip_hw_reset:
5325
5326         /* Post ASIC reset for all devs. */
5327         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5328
5329                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5330                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5331
5332                         if (!ring || !ring->sched.thread)
5333                                 continue;
5334
5335                         drm_sched_start(&ring->sched, true);
5336                 }
5337
5338                 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5339                         amdgpu_mes_self_test(tmp_adev);
5340
5341                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5342                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5343                 }
5344
5345                 if (tmp_adev->asic_reset_res)
5346                         r = tmp_adev->asic_reset_res;
5347
5348                 tmp_adev->asic_reset_res = 0;
5349
5350                 if (r) {
5351                         /* bad news, how to tell it to userspace ? */
5352                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5353                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5354                 } else {
5355                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5356                         if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5357                                 DRM_WARN("smart shift update failed\n");
5358                 }
5359         }
5360
5361 skip_sched_resume:
5362         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5363                 /* unlock kfd: SRIOV would do it separately */
5364                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5365                         amdgpu_amdkfd_post_reset(tmp_adev);
5366
5367                 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5368                  * so bring up kfd here if it was not initialized before.
5369                  */
5370                 if (!tmp_adev->kfd.init_complete)
5371                         amdgpu_amdkfd_device_init(tmp_adev);
5372
5373                 if (audio_suspended)
5374                         amdgpu_device_resume_display_audio(tmp_adev);
5375
5376                 amdgpu_device_unset_mp1_state(tmp_adev);
5377
5378                 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5379         }
5380
5381 recover_end:
5382         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5383                                             reset_list);
5384         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5385
5386         if (hive) {
5387                 mutex_unlock(&hive->hive_lock);
5388                 amdgpu_put_xgmi_hive(hive);
5389         }
5390
5391         if (r)
5392                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5393
5394         atomic_set(&adev->reset_domain->reset_res, r);
5395         return r;
5396 }
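/*
 * Illustrative sketch (not part of the driver): a hypothetical caller,
 * such as a job timeout handler, filling in an amdgpu_reset_context
 * before invoking recovery. The field usage mirrors what
 * amdgpu_device_gpu_recover() and amdgpu_device_pre_asic_reset() read
 * above; the .method assignment and the foo_* name are assumptions.
 */
#if 0
static void foo_trigger_recovery(struct amdgpu_device *adev,
                                 struct amdgpu_job *job)
{
        struct amdgpu_reset_context reset_context;

        memset(&reset_context, 0, sizeof(reset_context));
        reset_context.method = AMD_RESET_METHOD_NONE;  /* let the driver pick */
        reset_context.reset_req_dev = adev;
        reset_context.job = job;
        clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

        amdgpu_device_gpu_recover(adev, job, &reset_context);
}
#endif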
5397
5398 /**
5399  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5400  *
5401  * @adev: amdgpu_device pointer
5402  *
5403  * Fetches and stores in the driver the PCIE capabilities (gen speed
5404  * and lanes) of the slot the device is in. Handles APUs and
5405  * virtualized environments where PCIE config space may not be available.
5406  */
5407 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5408 {
5409         struct pci_dev *pdev;
5410         enum pci_bus_speed speed_cap, platform_speed_cap;
5411         enum pcie_link_width platform_link_width;
5412
5413         if (amdgpu_pcie_gen_cap)
5414                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5415
5416         if (amdgpu_pcie_lane_cap)
5417                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5418
5419         /* covers APUs as well */
5420         if (pci_is_root_bus(adev->pdev->bus)) {
5421                 if (adev->pm.pcie_gen_mask == 0)
5422                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5423                 if (adev->pm.pcie_mlw_mask == 0)
5424                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5425                 return;
5426         }
5427
5428         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5429                 return;
5430
5431         pcie_bandwidth_available(adev->pdev, NULL,
5432                                  &platform_speed_cap, &platform_link_width);
5433
5434         if (adev->pm.pcie_gen_mask == 0) {
5435                 /* asic caps */
5436                 pdev = adev->pdev;
5437                 speed_cap = pcie_get_speed_cap(pdev);
5438                 if (speed_cap == PCI_SPEED_UNKNOWN) {
5439                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5440                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5441                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5442                 } else {
5443                         if (speed_cap == PCIE_SPEED_32_0GT)
5444                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5445                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5446                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5447                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5448                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5449                         else if (speed_cap == PCIE_SPEED_16_0GT)
5450                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5451                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5452                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5453                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5454                         else if (speed_cap == PCIE_SPEED_8_0GT)
5455                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5456                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5457                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5458                         else if (speed_cap == PCIE_SPEED_5_0GT)
5459                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5460                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5461                         else
5462                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5463                 }
5464                 /* platform caps */
5465                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5466                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5467                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5468                 } else {
5469                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
5470                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5471                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5472                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5473                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5474                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5475                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5476                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5477                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5478                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5479                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5480                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5481                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5482                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5483                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5484                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5485                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5486                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5487                         else
5488                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5489
5490                 }
5491         }
5492         if (adev->pm.pcie_mlw_mask == 0) {
5493                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5494                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5495                 } else {
5496                         switch (platform_link_width) {
5497                         case PCIE_LNK_X32:
5498                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5499                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5500                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5501                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5502                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5503                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5504                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5505                                 break;
5506                         case PCIE_LNK_X16:
5507                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5508                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5509                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5510                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5511                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5512                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5513                                 break;
5514                         case PCIE_LNK_X12:
5515                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5516                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5517                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5518                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5519                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5520                                 break;
5521                         case PCIE_LNK_X8:
5522                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5523                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5524                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5525                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5526                                 break;
5527                         case PCIE_LNK_X4:
5528                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5529                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5530                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5531                                 break;
5532                         case PCIE_LNK_X2:
5533                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5534                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5535                                 break;
5536                         case PCIE_LNK_X1:
5537                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5538                                 break;
5539                         default:
5540                                 break;
5541                         }
5542                 }
5543         }
5544 }
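/*
 * Worked example (derived from the mask assignments above): for a Gen4
 * capable ASIC in a Gen4 x16 slot, pcie_get_speed_cap() and
 * pcie_bandwidth_available() report PCIE_SPEED_16_0GT and PCIE_LNK_X16,
 * so the function ends up with:
 *
 *   pcie_gen_mask: CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1..GEN4 |
 *                  CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1..GEN4
 *   pcie_mlw_mask: CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | X12 | X8 | X4 | X2 | X1
 */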
5545
5546 /**
5547  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5548  *
5549  * @adev: amdgpu_device pointer
5550  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5551  *
5552  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5553  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5554  * @peer_adev.
5555  */
5556 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5557                                       struct amdgpu_device *peer_adev)
5558 {
5559 #ifdef CONFIG_HSA_AMD_P2P
5560         uint64_t address_mask = peer_adev->dev->dma_mask ?
5561                 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5562         resource_size_t aper_limit =
5563                 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5564         bool p2p_access =
5565                 !adev->gmc.xgmi.connected_to_cpu &&
5566                 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5567
5568         return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5569                 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5570                 !(adev->gmc.aper_base & address_mask ||
5571                   aper_limit & address_mask));
5572 #else
5573         return false;
5574 #endif
5575 }
5576
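/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Put the GPU into the BACO low power state. When RAS is enabled the
 * doorbell interrupt is disabled first so that spurious doorbells cannot
 * reach the chip while it is powered down.
 *
 * Returns 0 on success, -ENOTSUPP if BACO is not supported on this
 * device, or a negative error code from the DPM call otherwise.
 */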
5577 int amdgpu_device_baco_enter(struct drm_device *dev)
5578 {
5579         struct amdgpu_device *adev = drm_to_adev(dev);
5580         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5581
5582         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5583                 return -ENOTSUPP;
5584
5585         if (ras && adev->ras_enabled &&
5586             adev->nbio.funcs->enable_doorbell_interrupt)
5587                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5588
5589         return amdgpu_dpm_baco_enter(adev);
5590 }
5591
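/**
 * amdgpu_device_baco_exit - exit BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Bring the GPU back out of BACO. When RAS is enabled the doorbell
 * interrupt is re-enabled, and for passthrough (virtualized) setups any
 * stale doorbell interrupt state is cleared if the NBIO block provides
 * a callback for it.
 *
 * Returns 0 on success, -ENOTSUPP if BACO is not supported on this
 * device, or a negative error code from the DPM call otherwise.
 */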
5592 int amdgpu_device_baco_exit(struct drm_device *dev)
5593 {
5594         struct amdgpu_device *adev = drm_to_adev(dev);
5595         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5596         int ret = 0;
5597
5598         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5599                 return -ENOTSUPP;
5600
5601         ret = amdgpu_dpm_baco_exit(adev);
5602         if (ret)
5603                 return ret;
5604
5605         if (ras && adev->ras_enabled &&
5606             adev->nbio.funcs->enable_doorbell_interrupt)
5607                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5608
5609         if (amdgpu_passthrough(adev) &&
5610             adev->nbio.funcs->clear_doorbell_interrupt)
5611                 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5612
5613         return 0;
5614 }
5615
5616 /**
5617  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5618  * @pdev: PCI device struct
5619  * @state: PCI channel state
5620  *
5621  * Description: Called when a PCI error is detected.
5622  *
5623  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5624  */
5625 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5626 {
5627         struct drm_device *dev = pci_get_drvdata(pdev);
5628         struct amdgpu_device *adev = drm_to_adev(dev);
5629         int i;
5630
5631         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5632
5633         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5634                 DRM_WARN("No support for XGMI hive yet...");
5635                 return PCI_ERS_RESULT_DISCONNECT;
5636         }
5637
5638         adev->pci_channel_state = state;
5639
5640         switch (state) {
5641         case pci_channel_io_normal:
5642                 return PCI_ERS_RESULT_CAN_RECOVER;
5643         /* Fatal error, prepare for slot reset */
5644         case pci_channel_io_frozen:
5645                 /*
5646                  * Locking adev->reset_domain->sem will prevent any external access
5647                  * to GPU during PCI error recovery
5648                  */
5649                 amdgpu_device_lock_reset_domain(adev->reset_domain);
5650                 amdgpu_device_set_mp1_state(adev);
5651
5652                 /*
5653                  * Block any work scheduling as we do for regular GPU reset
5654                  * for the duration of the recovery
5655                  */
5656                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5657                         struct amdgpu_ring *ring = adev->rings[i];
5658
5659                         if (!ring || !ring->sched.thread)
5660                                 continue;
5661
5662                         drm_sched_stop(&ring->sched, NULL);
5663                 }
5664                 atomic_inc(&adev->gpu_reset_counter);
5665                 return PCI_ERS_RESULT_NEED_RESET;
5666         case pci_channel_io_perm_failure:
5667                 /* Permanent error, prepare for device removal */
5668                 return PCI_ERS_RESULT_DISCONNECT;
5669         }
5670
5671         return PCI_ERS_RESULT_NEED_RESET;
5672 }
5673
5674 /**
5675  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5676  * @pdev: pointer to PCI device
5677  */
5678 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5679 {
5681         DRM_INFO("PCI error: mmio enabled callback!!\n");
5682
5683         /* TODO - dump whatever for debugging purposes */
5684
5685         /* This is called only if amdgpu_pci_error_detected returns
5686          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5687          * works, no need to reset slot.
5688          */
5689
5690         return PCI_ERS_RESULT_RECOVERED;
5691 }
5692
5693 /**
5694  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5695  * @pdev: PCI device struct
5696  *
5697  * Description: This routine is called by the PCI error recovery
5698  * code after the PCI slot has been reset, just before we
5699  * should resume normal operations.
5700  */
5701 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5702 {
5703         struct drm_device *dev = pci_get_drvdata(pdev);
5704         struct amdgpu_device *adev = drm_to_adev(dev);
5705         int r, i;
5706         struct amdgpu_reset_context reset_context;
5707         u32 memsize;
5708         struct list_head device_list;
5709
5710         DRM_INFO("PCI error: slot reset callback!!\n");
5711
5712         memset(&reset_context, 0, sizeof(reset_context));
5713
5714         INIT_LIST_HEAD(&device_list);
5715         list_add_tail(&adev->reset_list, &device_list);
5716
5717         /* wait for asic to come out of reset */
5718         msleep(500);
5719
5720         /* Restore PCI config space */
5721         amdgpu_device_load_pci_state(pdev);
5722
5723         /* confirm ASIC came out of reset */
5724         for (i = 0; i < adev->usec_timeout; i++) {
5725                 memsize = amdgpu_asic_get_config_memsize(adev);
5726
5727                 if (memsize != 0xffffffff)
5728                         break;
5729                 udelay(1);
5730         }
5731         if (memsize == 0xffffffff) {
5732                 r = -ETIME;
5733                 goto out;
5734         }
5735
5736         reset_context.method = AMD_RESET_METHOD_NONE;
5737         reset_context.reset_req_dev = adev;
5738         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5739         set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5740
5741         adev->no_hw_access = true;
5742         r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5743         adev->no_hw_access = false;
5744         if (r)
5745                 goto out;
5746
5747         r = amdgpu_do_asic_reset(&device_list, &reset_context);
5748
5749 out:
5750         if (!r) {
5751                 if (amdgpu_device_cache_pci_state(adev->pdev))
5752                         pci_restore_state(adev->pdev);
5753
5754                 DRM_INFO("PCIe error recovery succeeded\n");
5755         } else {
5756                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5757                 amdgpu_device_unset_mp1_state(adev);
5758                 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5759         }
5760
5761         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5762 }
5763
5764 /**
5765  * amdgpu_pci_resume() - resume normal ops after PCI reset
5766  * @pdev: pointer to PCI device
5767  *
5768  * Called when the error recovery driver tells us that it's
5769  * OK to resume normal operation.
5770  */
5771 void amdgpu_pci_resume(struct pci_dev *pdev)
5772 {
5773         struct drm_device *dev = pci_get_drvdata(pdev);
5774         struct amdgpu_device *adev = drm_to_adev(dev);
5775         int i;
5776
5778         DRM_INFO("PCI error: resume callback!!\n");
5779
5780         /* Only continue execution for the case of pci_channel_io_frozen */
5781         if (adev->pci_channel_state != pci_channel_io_frozen)
5782                 return;
5783
5784         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5785                 struct amdgpu_ring *ring = adev->rings[i];
5786
5787                 if (!ring || !ring->sched.thread)
5788                         continue;
5789
5790                 drm_sched_start(&ring->sched, true);
5791         }
5792
5793         amdgpu_device_unset_mp1_state(adev);
5794         amdgpu_device_unlock_reset_domain(adev->reset_domain);
5795 }
5796
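/**
 * amdgpu_device_cache_pci_state - save and cache the PCI config space
 *
 * @pdev: PCI device struct
 *
 * Save the current PCI configuration space and keep a copy of it in
 * adev->pci_state so it can be restored later, e.g. after a GPU reset
 * or during PCI error recovery.
 *
 * Returns true on success, false if the state could not be saved or stored.
 */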
5797 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5798 {
5799         struct drm_device *dev = pci_get_drvdata(pdev);
5800         struct amdgpu_device *adev = drm_to_adev(dev);
5801         int r;
5802
5803         r = pci_save_state(pdev);
5804         if (!r) {
5805                 kfree(adev->pci_state);
5806
5807                 adev->pci_state = pci_store_saved_state(pdev);
5808
5809                 if (!adev->pci_state) {
5810                         DRM_ERROR("Failed to store PCI saved state");
5811                         return false;
5812                 }
5813         } else {
5814                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5815                 return false;
5816         }
5817
5818         return true;
5819 }
5820
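/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Load the PCI configuration space previously cached in adev->pci_state
 * (if any) back into the device and restore it.
 *
 * Returns true on success, false if no cached state exists or loading failed.
 */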
5821 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5822 {
5823         struct drm_device *dev = pci_get_drvdata(pdev);
5824         struct amdgpu_device *adev = drm_to_adev(dev);
5825         int r;
5826
5827         if (!adev->pci_state)
5828                 return false;
5829
5830         r = pci_load_saved_state(pdev, adev->pci_state);
5831
5832         if (!r) {
5833                 pci_restore_state(pdev);
5834         } else {
5835                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5836                 return false;
5837         }
5838
5839         return true;
5840 }
5841
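/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring to emit the flush on
 *
 * Flush the HDP cache so that CPU writes to VRAM become visible to the
 * GPU. The flush is skipped on bare-metal x86-64 APUs and on devices
 * whose XGMI link is connected to the CPU, where it is not needed. If a
 * ring that can emit the flush is provided, the flush is emitted on that
 * ring, otherwise the ASIC-level MMIO path is used.
 */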
5842 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5843                 struct amdgpu_ring *ring)
5844 {
5845 #ifdef CONFIG_X86_64
5846         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5847                 return;
5848 #endif
5849         if (adev->gmc.xgmi.connected_to_cpu)
5850                 return;
5851
5852         if (ring && ring->funcs->emit_hdp_flush)
5853                 amdgpu_ring_emit_hdp_flush(ring);
5854         else
5855                 amdgpu_asic_flush_hdp(adev, ring);
5856 }
5857
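/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring the invalidation is associated with
 *
 * Invalidate the HDP read cache so that subsequent CPU reads see
 * up-to-date VRAM contents. Skipped on bare-metal x86-64 APUs and on
 * devices whose XGMI link is connected to the CPU, where it is not needed.
 */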
5858 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5859                 struct amdgpu_ring *ring)
5860 {
5861 #ifdef CONFIG_X86_64
5862         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5863                 return;
5864 #endif
5865         if (adev->gmc.xgmi.connected_to_cpu)
5866                 return;
5867
5868         amdgpu_asic_invalidate_hdp(adev, ring);
5869 }
5870
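/**
 * amdgpu_in_reset - check whether the GPU is currently in reset
 *
 * @adev: amdgpu_device pointer
 *
 * Returns non-zero if a GPU reset is in progress in this device's
 * reset domain, zero otherwise.
 */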
5871 int amdgpu_in_reset(struct amdgpu_device *adev)
5872 {
5873         return atomic_read(&adev->reset_domain->in_gpu_reset);
5874 }
5875
5876 /**
5877  * amdgpu_device_halt() - bring hardware to some kind of halt state
5878  *
5879  * @adev: amdgpu_device pointer
5880  *
5881  * Bring hardware to some kind of halt state so that no one can touch it
5882  * anymore. This helps preserve the error context when an error occurs.
5883  * Compared to a simple hang, the system will stay stable at least for SSH
5884  * access. Then it should be trivial to inspect the hardware state and
5885  * see what's going on. Implemented as follows:
5886  *
5887  * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
5888  *    clears all CPU mappings to the device, disallows remappings through page faults
5889  * 2. amdgpu_irq_disable_all() disables all interrupts
5890  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5891  * 4. set adev->no_hw_access to avoid potential crashes after step 5
5892  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5893  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5894  *    flush any in flight DMA operations
5895  */
5896 void amdgpu_device_halt(struct amdgpu_device *adev)
5897 {
5898         struct pci_dev *pdev = adev->pdev;
5899         struct drm_device *ddev = adev_to_drm(adev);
5900
5901         drm_dev_unplug(ddev);
5902
5903         amdgpu_irq_disable_all(adev);
5904
5905         amdgpu_fence_driver_hw_fini(adev);
5906
5907         adev->no_hw_access = true;
5908
5909         amdgpu_device_unmap_mmio(adev);
5910
5911         pci_disable_device(pdev);
5912         pci_wait_for_pending_transaction(pdev);
5913 }
5914
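/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: register offset (in dwords)
 *
 * Read a register in the PCIe port block through the indirect
 * index/data pair provided by the NBIO block, serialized with the
 * pcie_idx_lock spinlock.
 *
 * Returns the 32 bit register value.
 */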
5915 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5916                                 u32 reg)
5917 {
5918         unsigned long flags, address, data;
5919         u32 r;
5920
5921         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5922         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5923
5924         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5925         WREG32(address, reg * 4);
5926         (void)RREG32(address);
5927         r = RREG32(data);
5928         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5929         return r;
5930 }
5931
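/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: register offset (in dwords)
 * @v: 32 bit value to write
 *
 * Write a register in the PCIe port block through the indirect
 * index/data pair provided by the NBIO block, serialized with the
 * pcie_idx_lock spinlock.
 */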
5932 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5933                                 u32 reg, u32 v)
5934 {
5935         unsigned long flags, address, data;
5936
5937         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5938         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5939
5940         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5941         WREG32(address, reg * 4);
5942         (void)RREG32(address);
5943         WREG32(data, v);
5944         (void)RREG32(data);
5945         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5946 }
5947
5948 /**
5949  * amdgpu_device_switch_gang - switch to a new gang
5950  * @adev: amdgpu_device pointer
5951  * @gang: the gang to switch to
5952  *
5953  * Try to switch to a new gang.
5954  * Returns: NULL if we switched to the new gang or a reference to the current
5955  * gang leader.
5956  */
5957 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
5958                                             struct dma_fence *gang)
5959 {
5960         struct dma_fence *old = NULL;
5961
5962         do {
5963                 dma_fence_put(old);
5964                 rcu_read_lock();
5965                 old = dma_fence_get_rcu_safe(&adev->gang_submit);
5966                 rcu_read_unlock();
5967
5968                 if (old == gang)
5969                         break;
5970
5971                 if (!dma_fence_is_signaled(old))
5972                         return old;
5973
5974         } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
5975                          old, gang) != old);
5976
5977         dma_fence_put(old);
5978         return NULL;
5979 }
5980
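/**
 * amdgpu_device_has_display_hardware - check for display IP
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true if the ASIC has (non-harvested) display hardware, false
 * for display-less parts such as Hainan and Topaz. For chips enumerated
 * via IP discovery this checks the DCE HW IP version and the DMU
 * harvest mask.
 */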
5981 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
5982 {
5983         switch (adev->asic_type) {
5984 #ifdef CONFIG_DRM_AMDGPU_SI
5985         case CHIP_HAINAN:
5986 #endif
5987         case CHIP_TOPAZ:
5988                 /* chips with no display hardware */
5989                 return false;
5990 #ifdef CONFIG_DRM_AMDGPU_SI
5991         case CHIP_TAHITI:
5992         case CHIP_PITCAIRN:
5993         case CHIP_VERDE:
5994         case CHIP_OLAND:
5995 #endif
5996 #ifdef CONFIG_DRM_AMDGPU_CIK
5997         case CHIP_BONAIRE:
5998         case CHIP_HAWAII:
5999         case CHIP_KAVERI:
6000         case CHIP_KABINI:
6001         case CHIP_MULLINS:
6002 #endif
6003         case CHIP_TONGA:
6004         case CHIP_FIJI:
6005         case CHIP_POLARIS10:
6006         case CHIP_POLARIS11:
6007         case CHIP_POLARIS12:
6008         case CHIP_VEGAM:
6009         case CHIP_CARRIZO:
6010         case CHIP_STONEY:
6011                 /* chips with display hardware */
6012                 return true;
6013         default:
6014                 /* IP discovery */
6015                 if (!adev->ip_versions[DCE_HWIP][0] ||
6016                     (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6017                         return false;
6018                 return true;
6019         }
6020 }