/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS                2000
#define AMDGPU_MAX_RETRY_LIMIT          2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

const char *amdgpu_asic_name[] = {
        "TAHITI",
        "PITCAIRN",
        "VERDE",
        "OLAND",
        "HAINAN",
        "BONAIRE",
        "KAVERI",
        "KABINI",
        "HAWAII",
        "MULLINS",
        "TOPAZ",
        "TONGA",
        "FIJI",
        "CARRIZO",
        "STONEY",
        "POLARIS10",
        "POLARIS11",
        "POLARIS12",
        "VEGAM",
        "VEGA10",
        "VEGA12",
        "VEGA20",
        "RAVEN",
        "ARCTURUS",
        "RENOIR",
        "ALDEBARAN",
        "NAVI10",
        "CYAN_SKILLFISH",
        "NAVI14",
        "NAVI12",
        "SIENNA_CICHLID",
        "NAVY_FLOUNDER",
        "VANGOGH",
        "DIMGREY_CAVEFISH",
        "BEIGE_GOBY",
        "YELLOW_CARP",
        "IP DISCOVERY",
        "LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);
        uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

        return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
                amdgpu_device_get_pcie_replay_count, NULL);

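/*
 * Example (illustrative only, not part of the driver): the attribute above
 * is an ordinary sysfs file, so user space can read it directly. The card
 * index below is an assumption:
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 */
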
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
                amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
                amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
                amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
                return true;
        return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if (adev->has_pr3 ||
            ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
                return true;
        return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
        return (amdgpu_device_supports_boco(dev) &&
                amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
                             void *buf, size_t size, bool write)
{
        unsigned long flags;
        uint32_t hi = ~0, tmp = 0;
        uint32_t *data = buf;
        uint64_t last;
        int idx;

        if (!drm_dev_enter(adev_to_drm(adev), &idx))
                return;

        BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

        spin_lock_irqsave(&adev->mmio_idx_lock, flags);
        for (last = pos + size; pos < last; pos += 4) {
                tmp = pos >> 31;

                WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
                if (tmp != hi) {
                        WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
                        hi = tmp;
                }
                if (write)
                        WREG32_NO_KIQ(mmMM_DATA, *data++);
                else
                        *data++ = RREG32_NO_KIQ(mmMM_DATA);
        }

        spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
                                 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
        void __iomem *addr;
        size_t count = 0;
        uint64_t last;

        if (!adev->mman.aper_base_kaddr)
                return 0;

        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                addr = adev->mman.aper_base_kaddr + pos;
                count = last - pos;

                if (write) {
                        memcpy_toio(addr, buf, count);
                        mb();
                        amdgpu_device_flush_hdp(adev, NULL);
                } else {
                        amdgpu_device_invalidate_hdp(adev, NULL);
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

        }

        return count;
#else
        return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               void *buf, size_t size, bool write)
{
        size_t count;

        /* try to use the VRAM aperture to access VRAM first */
        count = amdgpu_device_aper_access(adev, pos, buf, size, write);
        size -= count;
        if (size) {
                /* use MM access for the rest of VRAM */
                pos += count;
                buf += count;
                amdgpu_device_mm_access(adev, pos, buf, size, write);
        }
}

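/*
 * Illustrative use of the helper above (a sketch, assuming a valid adev and
 * a 4-byte aligned VRAM offset; the offset is made up):
 *
 *   u32 val;
 *
 *   amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), false);
 */
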
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
        if (adev->no_hw_access)
                return true;

#ifdef CONFIG_LOCKDEP
        /*
         * This is a bit complicated to understand, so worth a comment. What we assert
         * here is that the GPU reset is not running on another thread in parallel.
         *
         * For this we trylock the read side of the reset semaphore, if that succeeds
         * we know that the reset is not running in parallel.
         *
         * If the trylock fails we assert that we are either already holding the read
         * side of the lock or are the reset thread itself and hold the write side of
         * the lock.
         */
        if (in_task()) {
                if (down_read_trylock(&adev->reset_domain->sem))
                        up_read(&adev->reset_domain->sem);
                else
                        lockdep_assert_held(&adev->reset_domain->sem);
        }
#endif
        return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
                            uint32_t reg, uint32_t acc_flags)
{
        uint32_t ret;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

        return ret;
}

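/*
 * Note (a sketch, assuming the macro layer in amdgpu.h): most callers do not
 * invoke these helpers directly but go through wrapper macros, e.g.
 *
 *   u32 tmp = RREG32(mmMM_INDEX);    // expands to amdgpu_device_rreg()
 *   WREG32(mmMM_INDEX, tmp);         // expands to amdgpu_device_wreg()
 */
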
/*
 * MMIO register read with byte offset helper function
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (offset < adev->rmmio_size)
                return (readb(adev->rmmio + offset));
        BUG();
}

/*
 * MMIO register write with byte offset helper function
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (offset < adev->rmmio_size)
                writeb(value, adev->rmmio + offset);
        else
                BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
                        uint32_t reg, uint32_t v,
                        uint32_t acc_flags)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        amdgpu_kiq_wreg(adev, reg, v);
                        up_read(&adev->reset_domain->sem);
                } else {
                        writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                adev->pcie_wreg(adev, reg * 4, v);
        }

        trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write a register either with direct/indirect mmio or with the RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
                             uint32_t reg, uint32_t v)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (amdgpu_sriov_fullaccess(adev) &&
            adev->gfx.rlc.funcs &&
            adev->gfx.rlc.funcs->is_rlcg_access_range) {
                if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
                        return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
        } else if ((reg * 4) >= adev->rmmio_size) {
                adev->pcie_wreg(adev, reg * 4, v);
        } else {
                writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
        }
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (index < adev->doorbell.num_doorbells) {
                return readl(adev->doorbell.ptr + index);
        } else {
                DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
                return 0;
        }
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (index < adev->doorbell.num_doorbells) {
                writel(v, adev->doorbell.ptr + index);
        } else {
                DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
        }
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (index < adev->doorbell.num_doorbells) {
                return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
        } else {
                DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
                return 0;
        }
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
        if (amdgpu_device_skip_hw_access(adev))
                return;

        if (index < adev->doorbell.num_doorbells) {
                atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
        } else {
                DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
        }
}

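/*
 * Illustrative use of the doorbell helpers (a sketch; the index variable is
 * hypothetical, real callers take it from the ring's assigned doorbell):
 *
 *   u64 wptr = 128;
 *
 *   amdgpu_mm_wdoorbell64(adev, ring_doorbell_index, wptr);
 *   wptr = amdgpu_mm_rdoorbell64(adev, ring_doorbell_index);
 */
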
/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
                                u32 pcie_index, u32 pcie_data,
                                u32 reg_addr)
{
        unsigned long flags;
        u32 r;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        r = readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

        return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
                                  u32 pcie_index, u32 pcie_data,
                                  u32 reg_addr)
{
        unsigned long flags;
        u64 r;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        /* read low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        r = readl(pcie_data_offset);
        /* read high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        r |= ((u64)readl(pcie_data_offset) << 32);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

        return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
                                 u32 pcie_index, u32 pcie_data,
                                 u32 reg_addr, u32 reg_data)
{
        unsigned long flags;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        writel(reg_data, pcie_data_offset);
        readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
                                   u32 pcie_index, u32 pcie_data,
                                   u32 reg_addr, u64 reg_data)
{
        unsigned long flags;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        /* write low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
        readl(pcie_data_offset);
        /* write high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        writel((u32)(reg_data >> 32), pcie_data_offset);
        readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

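/*
 * Sketch of how an ASIC layer might wrap the index/data pair helpers above;
 * this example function is hypothetical, and it assumes the offsets come
 * from the NBIO callbacks as in the soc15-era code:
 *
 *   static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           u32 idx  = adev->nbio.funcs->get_pcie_index_offset(adev);
 *           u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *           return amdgpu_device_indirect_rreg(adev, idx, data, reg);
 *   }
 */
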
/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
        DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
                                          uint32_t block, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
                  reg, block);
        BUG();
        return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
                                      uint32_t block,
                                      uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
                  reg, block, v);
        BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
        amdgpu_asic_pre_asic_init(adev);

        if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
                return amdgpu_atomfirmware_asic_init(adev, true);
        else
                return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
        return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
                                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
                                       &adev->vram_scratch.robj,
                                       &adev->vram_scratch.gpu_addr,
                                       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
        amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
                                             const u32 *registers,
                                             const u32 array_size)
{
        u32 tmp, reg, and_mask, or_mask;
        int i;

        if (array_size % 3)
                return;

        for (i = 0; i < array_size; i += 3) {
                reg = registers[i + 0];
                and_mask = registers[i + 1];
                or_mask = registers[i + 2];

                if (and_mask == 0xffffffff) {
                        tmp = or_mask;
                } else {
                        tmp = RREG32(reg);
                        tmp &= ~and_mask;
                        if (adev->family >= AMDGPU_FAMILY_AI)
                                tmp |= (or_mask & and_mask);
                        else
                                tmp |= or_mask;
                }
                WREG32(reg, tmp);
        }
}

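/*
 * Illustrative golden-register sequence (triples of register, AND mask,
 * OR mask); the register name and values below are made up:
 *
 *   static const u32 example_golden_settings[] = {
 *           mmEXAMPLE_CNTL, 0xffffff0f, 0x00000040,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */
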
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
        pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
        return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{
        /* No doorbell on SI hardware generation */
        if (adev->asic_type < CHIP_BONAIRE) {
                adev->doorbell.base = 0;
                adev->doorbell.size = 0;
                adev->doorbell.num_doorbells = 0;
                adev->doorbell.ptr = NULL;
                return 0;
        }

        if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
                return -EINVAL;

        amdgpu_asic_init_doorbell_index(adev);

        /* doorbell bar mapping */
        adev->doorbell.base = pci_resource_start(adev->pdev, 2);
        adev->doorbell.size = pci_resource_len(adev->pdev, 2);

        if (adev->enable_mes) {
                adev->doorbell.num_doorbells =
                        adev->doorbell.size / sizeof(u32);
        } else {
                adev->doorbell.num_doorbells =
                        min_t(u32, adev->doorbell.size / sizeof(u32),
                              adev->doorbell_index.max_assignment + 1);
                if (adev->doorbell.num_doorbells == 0)
                        return -EINVAL;

                /* For Vega, reserve and map two pages on the doorbell BAR since the
                 * SDMA paging queue doorbell uses the second page. The
                 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
                 * doorbells are in the first page. So with the paging queue enabled,
                 * the max num_doorbells should add one more page (0x400 in dwords).
                 */
                if (adev->asic_type >= CHIP_VEGA10)
                        adev->doorbell.num_doorbells += 0x400;
        }

        adev->doorbell.ptr = ioremap(adev->doorbell.base,
                                     adev->doorbell.num_doorbells *
                                     sizeof(u32));
        if (adev->doorbell.ptr == NULL)
                return -ENOMEM;

        return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
        iounmap(adev->doorbell.ptr);
        adev->doorbell.ptr = NULL;
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
        if (adev->wb.wb_obj) {
                amdgpu_bo_free_kernel(&adev->wb.wb_obj,
                                      &adev->wb.gpu_addr,
                                      (void **)&adev->wb.wb);
                adev->wb.wb_obj = NULL;
        }
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
        int r;

        if (adev->wb.wb_obj == NULL) {
                /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
                r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
                                            PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
                                            &adev->wb.wb_obj, &adev->wb.gpu_addr,
                                            (void **)&adev->wb.wb);
                if (r) {
                        dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
                        return r;
                }

                adev->wb.num_wb = AMDGPU_MAX_WB;
                memset(&adev->wb.used, 0, sizeof(adev->wb.used));

                /* clear wb memory */
                memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
        }

        return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
        unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

        if (offset < adev->wb.num_wb) {
                __set_bit(offset, adev->wb.used);
                *wb = offset << 3; /* convert to dw offset */
                return 0;
        } else {
                return -EINVAL;
        }
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
        wb >>= 3;
        if (wb < adev->wb.num_wb)
                __clear_bit(wb, adev->wb.used);
}

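/*
 * Typical writeback usage pattern (a sketch; error handling and the actual
 * hardware programming are omitted). The index returned by
 * amdgpu_device_wb_get() is a dword offset into the writeback page:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           // hardware writes status to gpu_addr, driver polls *cpu_addr
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */
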
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
        int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
        struct pci_bus *root;
        struct resource *res;
        unsigned i;
        u16 cmd;
        int r;

        /* Bypass for VF */
        if (amdgpu_sriov_vf(adev))
                return 0;

        /* skip if the bios has already enabled large BAR */
        if (adev->gmc.real_vram_size &&
            (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
                return 0;

        /* Check if the root BUS has 64bit memory resources */
        root = adev->pdev->bus;
        while (root->parent)
                root = root->parent;

        pci_bus_for_each_resource(root, res, i) {
                if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
                    res->start > 0x100000000ull)
                        break;
        }

        /* Trying to resize is pointless without a root hub window above 4GB */
        if (!res)
                return 0;

        /* Limit the BAR size to what is available */
        rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
                        rbar_size);

        /* Disable memory decoding while we change the BAR addresses and size */
        pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
        pci_write_config_word(adev->pdev, PCI_COMMAND,
                              cmd & ~PCI_COMMAND_MEMORY);

        /* Free the VRAM and doorbell BAR, we most likely need to move both. */
        amdgpu_device_doorbell_fini(adev);
        if (adev->asic_type >= CHIP_BONAIRE)
                pci_release_resource(adev->pdev, 2);

        pci_release_resource(adev->pdev, 0);

        r = pci_resize_resource(adev->pdev, 0, rbar_size);
        if (r == -ENOSPC)
                DRM_INFO("Not enough PCI address space for a large BAR.");
        else if (r && r != -ENOTSUPP)
                DRM_ERROR("Problem resizing BAR0 (%d).", r);

        pci_assign_unassigned_bus_resources(adev->pdev->bus);

        /* When the doorbell or fb BAR isn't available we have no chance of
         * using the device.
         */
        r = amdgpu_device_doorbell_init(adev);
        if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
                return -ENODEV;

        pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

        return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
        uint32_t reg;

        if (amdgpu_sriov_vf(adev))
                return false;

        if (amdgpu_passthrough(adev)) {
                /* For FIJI: in the whole-GPU pass-through virtualization case,
                 * after a VM reboot some old SMC firmware still needs the driver
                 * to do vPost, otherwise the GPU hangs. SMC firmware versions
                 * above 22.15 don't have this flaw, so force vPost for SMC
                 * versions below 22.15.
                 */
                if (adev->asic_type == CHIP_FIJI) {
                        int err;
                        uint32_t fw_ver;

                        err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
                        /* force vPost if an error occurred */
                        if (err)
                                return true;

                        fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
                        if (fw_ver < 0x00160e00)
                                return true;
                }
        }

        /* Don't post if we need to reset the whole hive on init */
        if (adev->gmc.xgmi.pending_reset)
                return false;

        if (adev->has_hw_reset) {
                adev->has_hw_reset = false;
                return true;
        }

        /* bios scratch used on CIK+ */
        if (adev->asic_type >= CHIP_BONAIRE)
                return amdgpu_atombios_scratch_need_asic_init(adev);

        /* check MEM_SIZE for older asics */
        reg = amdgpu_asic_get_config_memsize(adev);

        if ((reg != 0) && (reg != 0xffffffff))
                return false;

        return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
        switch (amdgpu_aspm) {
        case -1:
                break;
        case 0:
                return false;
        case 1:
                return true;
        default:
                return false;
        }
        return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
                bool state)
{
        struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

        amdgpu_asic_set_vga_state(adev, state);
        if (state)
                return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
                       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
        else
                return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory. A page is 4KB, so we have 12 bits of offset; a minimum of
 * 9 bits goes to the page table and the remaining bits go to the page
 * directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
        /* defines the number of bits in page table versus page directory,
         * a page is 4KB so we have 12 bits of offset, minimum 9 bits in the
         * page table and the remaining bits are in the page directory */
        if (amdgpu_vm_block_size == -1)
                return;

        if (amdgpu_vm_block_size < 9) {
                dev_warn(adev->dev, "VM page table size (%d) too small\n",
                         amdgpu_vm_block_size);
                amdgpu_vm_block_size = -1;
        }
}

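/*
 * Worked example (illustrative): with the minimum block size of 9, a page
 * table covers 9 bits of virtual address, i.e. 512 PTEs, and with 4KB pages
 * one page table then maps 512 * 4KB = 2MB of address space.
 */
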
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
        /* no need to check the default value */
        if (amdgpu_vm_size == -1)
                return;

        if (amdgpu_vm_size < 1) {
                dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
                         amdgpu_vm_size);
                amdgpu_vm_size = -1;
        }
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
        struct sysinfo si;
        bool is_os_64 = (sizeof(void *) == 8);
        uint64_t total_memory;
        uint64_t dram_size_seven_GB = 0x1B8000000;
        uint64_t dram_size_three_GB = 0xB8000000;

        if (amdgpu_smu_memory_pool_size == 0)
                return;

        if (!is_os_64) {
                DRM_WARN("Not 64-bit OS, feature not supported\n");
                goto def_value;
        }
        si_meminfo(&si);
        total_memory = (uint64_t)si.totalram * si.mem_unit;

        if ((amdgpu_smu_memory_pool_size == 1) ||
                (amdgpu_smu_memory_pool_size == 2)) {
                if (total_memory < dram_size_three_GB)
                        goto def_value1;
        } else if ((amdgpu_smu_memory_pool_size == 4) ||
                (amdgpu_smu_memory_pool_size == 8)) {
                if (total_memory < dram_size_seven_GB)
                        goto def_value1;
        } else {
                DRM_WARN("Smu memory pool size not supported\n");
                goto def_value;
        }
        adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

        return;

def_value1:
        DRM_WARN("Not enough system memory\n");
def_value:
        adev->pm.smu_prv_buffer_size = 0;
}

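/*
 * Note (illustrative arithmetic): amdgpu_smu_memory_pool_size is expressed
 * in 256MB units, so the shift by 28 converts it to bytes; for example a
 * module parameter of 2 yields 2 << 28 = 512MB for smu_prv_buffer_size.
 */
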
1463 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1464 {
1465         if (!(adev->flags & AMD_IS_APU) ||
1466             adev->asic_type < CHIP_RAVEN)
1467                 return 0;
1468
1469         switch (adev->asic_type) {
1470         case CHIP_RAVEN:
1471                 if (adev->pdev->device == 0x15dd)
1472                         adev->apu_flags |= AMD_APU_IS_RAVEN;
1473                 if (adev->pdev->device == 0x15d8)
1474                         adev->apu_flags |= AMD_APU_IS_PICASSO;
1475                 break;
1476         case CHIP_RENOIR:
1477                 if ((adev->pdev->device == 0x1636) ||
1478                     (adev->pdev->device == 0x164c))
1479                         adev->apu_flags |= AMD_APU_IS_RENOIR;
1480                 else
1481                         adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1482                 break;
1483         case CHIP_VANGOGH:
1484                 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1485                 break;
1486         case CHIP_YELLOW_CARP:
1487                 break;
1488         case CHIP_CYAN_SKILLFISH:
1489                 if ((adev->pdev->device == 0x13FE) ||
1490                     (adev->pdev->device == 0x143F))
1491                         adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1492                 break;
1493         default:
1494                 break;
1495         }
1496
1497         return 0;
1498 }
1499
1500 /**
1501  * amdgpu_device_check_arguments - validate module params
1502  *
1503  * @adev: amdgpu_device pointer
1504  *
1505  * Validates certain module parameters and updates
1506  * the associated values used by the driver (all asics).
1507  */
1508 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1509 {
1510         if (amdgpu_sched_jobs < 4) {
1511                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1512                          amdgpu_sched_jobs);
1513                 amdgpu_sched_jobs = 4;
1514         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1515                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1516                          amdgpu_sched_jobs);
1517                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1518         }
1519
1520         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1521                 /* gart size must be greater or equal to 32M */
1522                 dev_warn(adev->dev, "gart size (%d) too small\n",
1523                          amdgpu_gart_size);
1524                 amdgpu_gart_size = -1;
1525         }
1526
1527         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1528                 /* gtt size must be greater or equal to 32M */
1529                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1530                                  amdgpu_gtt_size);
1531                 amdgpu_gtt_size = -1;
1532         }
1533
1534         /* valid range is between 4 and 9 inclusive */
1535         if (amdgpu_vm_fragment_size != -1 &&
1536             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1537                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1538                 amdgpu_vm_fragment_size = -1;
1539         }
1540
1541         if (amdgpu_sched_hw_submission < 2) {
1542                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1543                          amdgpu_sched_hw_submission);
1544                 amdgpu_sched_hw_submission = 2;
1545         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1546                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1547                          amdgpu_sched_hw_submission);
1548                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1549         }
1550
1551         if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1552                 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1553                 amdgpu_reset_method = -1;
1554         }
1555
1556         amdgpu_device_check_smu_prv_buffer_size(adev);
1557
1558         amdgpu_device_check_vm_size(adev);
1559
1560         amdgpu_device_check_block_size(adev);
1561
1562         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1563
1564         return 0;
1565 }
1566
1567 /**
1568  * amdgpu_switcheroo_set_state - set switcheroo state
1569  *
1570  * @pdev: pci dev pointer
1571  * @state: vga_switcheroo state
1572  *
1573  * Callback for the switcheroo driver.  Suspends or resumes
1574  * the asics before or after it is powered up using ACPI methods.
1575  */
1576 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1577                                         enum vga_switcheroo_state state)
1578 {
1579         struct drm_device *dev = pci_get_drvdata(pdev);
1580         int r;
1581
1582         if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1583                 return;
1584
1585         if (state == VGA_SWITCHEROO_ON) {
1586                 pr_info("switched on\n");
1587                 /* don't suspend or resume card normally */
1588                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1589
1590                 pci_set_power_state(pdev, PCI_D0);
1591                 amdgpu_device_load_pci_state(pdev);
1592                 r = pci_enable_device(pdev);
1593                 if (r)
1594                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1595                 amdgpu_device_resume(dev, true);
1596
1597                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1598         } else {
1599                 pr_info("switched off\n");
1600                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1601                 amdgpu_device_suspend(dev, true);
1602                 amdgpu_device_cache_pci_state(pdev);
1603                 /* Shut down the device */
1604                 pci_disable_device(pdev);
1605                 pci_set_power_state(pdev, PCI_D3cold);
1606                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1607         }
1608 }
1609
1610 /**
1611  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1612  *
1613  * @pdev: pci dev pointer
1614  *
1615  * Callback for the switcheroo driver.  Checks if the switcheroo
1616  * state can be changed.
1617  * Returns true if the state can be changed, false if not.
1618  */
1619 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1620 {
1621         struct drm_device *dev = pci_get_drvdata(pdev);
1622
1623         /*
1624          * FIXME: open_count is protected by drm_global_mutex but that would lead to
1625          * locking inversion with the driver load path. And the access here is
1626          * completely racy anyway. So don't bother with locking for now.
1627          */
1628         return atomic_read(&dev->open_count) == 0;
1629 }
1630
1631 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1632         .set_gpu_state = amdgpu_switcheroo_set_state,
1633         .reprobe = NULL,
1634         .can_switch = amdgpu_switcheroo_can_switch,
1635 };
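
/*
 * Usage sketch (illustrative, based on how amdgpu_device_init registers
 * these callbacks; "px" here is a local flag indicating PX support):
 *
 *	bool px = amdgpu_device_supports_px(ddev);
 *
 *	vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);
 */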
1636
1637 /**
1638  * amdgpu_device_ip_set_clockgating_state - set the CG state
1639  *
1640  * @dev: amdgpu_device pointer
1641  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1642  * @state: clockgating state (gate or ungate)
1643  *
1644  * Sets the requested clockgating state for all instances of
1645  * the hardware IP specified.
1646  * Returns the error code from the last instance.
1647  */
1648 int amdgpu_device_ip_set_clockgating_state(void *dev,
1649                                            enum amd_ip_block_type block_type,
1650                                            enum amd_clockgating_state state)
1651 {
1652         struct amdgpu_device *adev = dev;
1653         int i, r = 0;
1654
1655         for (i = 0; i < adev->num_ip_blocks; i++) {
1656                 if (!adev->ip_blocks[i].status.valid)
1657                         continue;
1658                 if (adev->ip_blocks[i].version->type != block_type)
1659                         continue;
1660                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1661                         continue;
1662                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1663                         (void *)adev, state);
1664                 if (r)
1665                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1666                                   adev->ip_blocks[i].version->funcs->name, r);
1667         }
1668         return r;
1669 }
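
/*
 * Usage sketch (illustrative only, not called from this spot): gate the
 * clocks of every GFX IP instance and log a failure, mirroring how
 * callers elsewhere in the driver use this helper.
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev,
 *						   AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *	if (r)
 *		DRM_ERROR("failed to gate GFX clocks (%d)\n", r);
 */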
1670
1671 /**
1672  * amdgpu_device_ip_set_powergating_state - set the PG state
1673  *
1674  * @dev: amdgpu_device pointer
1675  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1676  * @state: powergating state (gate or ungate)
1677  *
1678  * Sets the requested powergating state for all instances of
1679  * the hardware IP specified.
1680  * Returns the error code from the last instance.
1681  */
1682 int amdgpu_device_ip_set_powergating_state(void *dev,
1683                                            enum amd_ip_block_type block_type,
1684                                            enum amd_powergating_state state)
1685 {
1686         struct amdgpu_device *adev = dev;
1687         int i, r = 0;
1688
1689         for (i = 0; i < adev->num_ip_blocks; i++) {
1690                 if (!adev->ip_blocks[i].status.valid)
1691                         continue;
1692                 if (adev->ip_blocks[i].version->type != block_type)
1693                         continue;
1694                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1695                         continue;
1696                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1697                         (void *)adev, state);
1698                 if (r)
1699                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1700                                   adev->ip_blocks[i].version->funcs->name, r);
1701         }
1702         return r;
1703 }
1704
1705 /**
1706  * amdgpu_device_ip_get_clockgating_state - get the CG state
1707  *
1708  * @adev: amdgpu_device pointer
1709  * @flags: clockgating feature flags
1710  *
1711  * Walks the list of IPs on the device and updates the clockgating
1712  * flags for each IP.
1713  * Updates @flags with the feature flags for each hardware IP where
1714  * clockgating is enabled.
1715  */
1716 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1717                                             u64 *flags)
1718 {
1719         int i;
1720
1721         for (i = 0; i < adev->num_ip_blocks; i++) {
1722                 if (!adev->ip_blocks[i].status.valid)
1723                         continue;
1724                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1725                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1726         }
1727 }
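
/*
 * Usage sketch (illustrative only): collect the clockgating feature flags
 * and test one of them; AMD_CG_SUPPORT_GFX_MGCG is a real flag from
 * amd_shared.h, while the surrounding code is hypothetical.
 *
 *	u64 flags = 0;
 *
 *	amdgpu_device_ip_get_clockgating_state(adev, &flags);
 *	if (flags & AMD_CG_SUPPORT_GFX_MGCG)
 *		DRM_INFO("GFX medium grain clockgating is enabled\n");
 */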
1728
1729 /**
1730  * amdgpu_device_ip_wait_for_idle - wait for idle
1731  *
1732  * @adev: amdgpu_device pointer
1733  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1734  *
1735  * Waits for the requested hardware IP to be idle.
1736  * Returns 0 for success or a negative error code on failure.
1737  */
1738 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1739                                    enum amd_ip_block_type block_type)
1740 {
1741         int i, r;
1742
1743         for (i = 0; i < adev->num_ip_blocks; i++) {
1744                 if (!adev->ip_blocks[i].status.valid)
1745                         continue;
1746                 if (adev->ip_blocks[i].version->type == block_type) {
1747                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1748                         if (r)
1749                                 return r;
1750                         break;
1751                 }
1752         }
1753         return 0;
1755 }
1756
1757 /**
1758  * amdgpu_device_ip_is_idle - is the hardware IP idle
1759  *
1760  * @adev: amdgpu_device pointer
1761  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1762  *
1763  * Check if the hardware IP is idle or not.
1764  * Returns true if the IP is idle, false if not.
1765  */
1766 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1767                               enum amd_ip_block_type block_type)
1768 {
1769         int i;
1770
1771         for (i = 0; i < adev->num_ip_blocks; i++) {
1772                 if (!adev->ip_blocks[i].status.valid)
1773                         continue;
1774                 if (adev->ip_blocks[i].version->type == block_type)
1775                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1776         }
1777         return true;
1779 }
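
/*
 * Usage sketch (illustrative only): the two helpers above pair naturally,
 * e.g. before touching GMC state:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GMC)) {
 *		r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
 *		if (r)
 *			return r;
 *	}
 */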
1780
1781 /**
1782  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1783  *
1784  * @adev: amdgpu_device pointer
1785  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1786  *
1787  * Returns a pointer to the hardware IP block structure
1788  * if it exists for the asic, otherwise NULL.
1789  */
1790 struct amdgpu_ip_block *
1791 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1792                               enum amd_ip_block_type type)
1793 {
1794         int i;
1795
1796         for (i = 0; i < adev->num_ip_blocks; i++)
1797                 if (adev->ip_blocks[i].version->type == type)
1798                         return &adev->ip_blocks[i];
1799
1800         return NULL;
1801 }
1802
1803 /**
1804  * amdgpu_device_ip_block_version_cmp
1805  *
1806  * @adev: amdgpu_device pointer
1807  * @type: enum amd_ip_block_type
1808  * @major: major version
1809  * @minor: minor version
1810  *
1811  * Returns 0 if the IP block version is equal to or greater than the
1812  * requested version, or 1 if it is smaller or the ip_block doesn't exist.
1813  */
1814 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1815                                        enum amd_ip_block_type type,
1816                                        u32 major, u32 minor)
1817 {
1818         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1819
1820         if (ip_block && ((ip_block->version->major > major) ||
1821                         ((ip_block->version->major == major) &&
1822                         (ip_block->version->minor >= minor))))
1823                 return 0;
1824
1825         return 1;
1826 }
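
/*
 * Usage sketch (illustrative only): fetch an IP block and branch on its
 * version, either directly or via the comparison helper above.
 *
 *	struct amdgpu_ip_block *gmc =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);
 *
 *	if (gmc && gmc->version->major >= 9)
 *		...;	// GMC v9 or newer is present
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       9, 0) == 0)
 *		...;	// GFX 9.0 or newer is present
 */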
1827
1828 /**
1829  * amdgpu_device_ip_block_add
1830  *
1831  * @adev: amdgpu_device pointer
1832  * @ip_block_version: pointer to the IP to add
1833  *
1834  * Adds the IP block driver information to the collection of IPs
1835  * on the asic.
1836  */
1837 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1838                                const struct amdgpu_ip_block_version *ip_block_version)
1839 {
1840         if (!ip_block_version)
1841                 return -EINVAL;
1842
1843         switch (ip_block_version->type) {
1844         case AMD_IP_BLOCK_TYPE_VCN:
1845                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1846                         return 0;
1847                 break;
1848         case AMD_IP_BLOCK_TYPE_JPEG:
1849                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1850                         return 0;
1851                 break;
1852         default:
1853                 break;
1854         }
1855
1856         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1857                   ip_block_version->funcs->name);
1858
1859         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1860
1861         return 0;
1862 }
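
/*
 * Usage sketch: the per-ASIC setup code builds the IP list with this
 * helper, e.g. vi_set_ip_blocks() starts with the common block
 * (vi_common_ip_block is a real table exported from vi.h):
 *
 *	r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	if (r)
 *		return r;
 */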
1863
1864 /**
1865  * amdgpu_device_enable_virtual_display - enable virtual display feature
1866  *
1867  * @adev: amdgpu_device pointer
1868  *
1869  * Enables the virtual display feature if the user has enabled it via
1870  * the module parameter virtual_display.  This feature provides a virtual
1871  * display hardware on headless boards or in virtualized environments.
1872  * This function parses and validates the configuration string specified by
1873  * the user and configures the virtual display configuration (number of
1874  * virtual connectors, crtcs, etc.) specified.
1875  */
1876 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1877 {
1878         adev->enable_virtual_display = false;
1879
1880         if (amdgpu_virtual_display) {
1881                 const char *pci_address_name = pci_name(adev->pdev);
1882                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1883
1884                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1885                 pciaddstr_tmp = pciaddstr;
1886                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1887                         pciaddname = strsep(&pciaddname_tmp, ",");
1888                         if (!strcmp("all", pciaddname) ||
1889                             !strcmp(pci_address_name, pciaddname)) {
1890                                 long num_crtc;
1891                                 int res = -1;
1892
1893                                 adev->enable_virtual_display = true;
1894
1895                                 if (pciaddname_tmp)
1896                                         res = kstrtol(pciaddname_tmp, 10,
1897                                                       &num_crtc);
1898
1899                                 if (!res) {
1900                                         if (num_crtc < 1)
1901                                                 num_crtc = 1;
1902                                         if (num_crtc > 6)
1903                                                 num_crtc = 6;
1904                                         adev->mode_info.num_crtc = num_crtc;
1905                                 } else {
1906                                         adev->mode_info.num_crtc = 1;
1907                                 }
1908                                 break;
1909                         }
1910                 }
1911
1912                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1913                          amdgpu_virtual_display, pci_address_name,
1914                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1915
1916                 kfree(pciaddstr);
1917         }
1918 }
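
/*
 * Editorial note: per the parsing above, the virtual_display string is a
 * semicolon-separated list of "<pci address>,<crtc count>" entries, with
 * "all" matching every device and the crtc count clamped to 1..6. A
 * hypothetical example:
 *
 *	modprobe amdgpu virtual_display=0000:01:00.0,2
 *
 * enables two virtual crtcs on the device at 0000:01:00.0.
 */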
1919
1920 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1921 {
1922         if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1923                 adev->mode_info.num_crtc = 1;
1924                 adev->enable_virtual_display = true;
1925                 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1926                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1927         }
1928 }
1929
1930 /**
1931  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1932  *
1933  * @adev: amdgpu_device pointer
1934  *
1935  * Parses the asic configuration parameters specified in the gpu info
1936  * firmware and makes them available to the driver for use in configuring
1937  * the asic.
1938  * Returns 0 on success, -EINVAL on failure.
1939  */
1940 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1941 {
1942         const char *chip_name;
1943         char fw_name[40];
1944         int err;
1945         const struct gpu_info_firmware_header_v1_0 *hdr;
1946
1947         adev->firmware.gpu_info_fw = NULL;
1948
1949         if (adev->mman.discovery_bin) {
1950                 /*
1951                  * FIXME: The bounding box is still needed by Navi12, so
1952                  * temporarily read it from gpu_info firmware. Should be dropped
1953                  * when DAL no longer needs it.
1954                  */
1955                 if (adev->asic_type != CHIP_NAVI12)
1956                         return 0;
1957         }
1958
1959         switch (adev->asic_type) {
1960         default:
1961                 return 0;
1962         case CHIP_VEGA10:
1963                 chip_name = "vega10";
1964                 break;
1965         case CHIP_VEGA12:
1966                 chip_name = "vega12";
1967                 break;
1968         case CHIP_RAVEN:
1969                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1970                         chip_name = "raven2";
1971                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1972                         chip_name = "picasso";
1973                 else
1974                         chip_name = "raven";
1975                 break;
1976         case CHIP_ARCTURUS:
1977                 chip_name = "arcturus";
1978                 break;
1979         case CHIP_NAVI12:
1980                 chip_name = "navi12";
1981                 break;
1982         }
1983
1984         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1985         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1986         if (err) {
1987                 dev_err(adev->dev,
1988                         "Failed to load gpu_info firmware \"%s\"\n",
1989                         fw_name);
1990                 goto out;
1991         }
1992         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1993         if (err) {
1994                 dev_err(adev->dev,
1995                         "Failed to validate gpu_info firmware \"%s\"\n",
1996                         fw_name);
1997                 goto out;
1998         }
1999
2000         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2001         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2002
2003         switch (hdr->version_major) {
2004         case 1:
2005         {
2006                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2007                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2008                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2009
2010                 /*
2011                  * Should be dropped when DAL no longer needs it.
2012                  */
2013                 if (adev->asic_type == CHIP_NAVI12)
2014                         goto parse_soc_bounding_box;
2015
2016                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2017                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2018                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2019                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2020                 adev->gfx.config.max_texture_channel_caches =
2021                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
2022                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2023                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2024                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2025                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2026                 adev->gfx.config.double_offchip_lds_buf =
2027                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2028                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2029                 adev->gfx.cu_info.max_waves_per_simd =
2030                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2031                 adev->gfx.cu_info.max_scratch_slots_per_cu =
2032                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2033                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2034                 if (hdr->version_minor >= 1) {
2035                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2036                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2037                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2038                         adev->gfx.config.num_sc_per_sh =
2039                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2040                         adev->gfx.config.num_packer_per_sc =
2041                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2042                 }
2043
2044 parse_soc_bounding_box:
2045                 /*
2046                  * soc bounding box info is not integrated into the discovery table,
2047                  * so we always need to parse it from the gpu info firmware if needed.
2048                  */
2049                 if (hdr->version_minor == 2) {
2050                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2051                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2052                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2053                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2054                 }
2055                 break;
2056         }
2057         default:
2058                 dev_err(adev->dev,
2059                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2060                 err = -EINVAL;
2061                 goto out;
2062         }
2063 out:
2064         return err;
2065 }
2066
2067 /**
2068  * amdgpu_device_ip_early_init - run early init for hardware IPs
2069  *
2070  * @adev: amdgpu_device pointer
2071  *
2072  * Early initialization pass for hardware IPs.  The hardware IPs that make
2073  * up each asic are discovered and each IP's early_init callback is run.  This
2074  * is the first stage in initializing the asic.
2075  * Returns 0 on success, negative error code on failure.
2076  */
2077 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2078 {
2079         struct drm_device *dev = adev_to_drm(adev);
2080         struct pci_dev *parent;
2081         int i, r;
2082
2083         amdgpu_device_enable_virtual_display(adev);
2084
2085         if (amdgpu_sriov_vf(adev)) {
2086                 r = amdgpu_virt_request_full_gpu(adev, true);
2087                 if (r)
2088                         return r;
2089         }
2090
2091         switch (adev->asic_type) {
2092 #ifdef CONFIG_DRM_AMDGPU_SI
2093         case CHIP_VERDE:
2094         case CHIP_TAHITI:
2095         case CHIP_PITCAIRN:
2096         case CHIP_OLAND:
2097         case CHIP_HAINAN:
2098                 adev->family = AMDGPU_FAMILY_SI;
2099                 r = si_set_ip_blocks(adev);
2100                 if (r)
2101                         return r;
2102                 break;
2103 #endif
2104 #ifdef CONFIG_DRM_AMDGPU_CIK
2105         case CHIP_BONAIRE:
2106         case CHIP_HAWAII:
2107         case CHIP_KAVERI:
2108         case CHIP_KABINI:
2109         case CHIP_MULLINS:
2110                 if (adev->flags & AMD_IS_APU)
2111                         adev->family = AMDGPU_FAMILY_KV;
2112                 else
2113                         adev->family = AMDGPU_FAMILY_CI;
2114
2115                 r = cik_set_ip_blocks(adev);
2116                 if (r)
2117                         return r;
2118                 break;
2119 #endif
2120         case CHIP_TOPAZ:
2121         case CHIP_TONGA:
2122         case CHIP_FIJI:
2123         case CHIP_POLARIS10:
2124         case CHIP_POLARIS11:
2125         case CHIP_POLARIS12:
2126         case CHIP_VEGAM:
2127         case CHIP_CARRIZO:
2128         case CHIP_STONEY:
2129                 if (adev->flags & AMD_IS_APU)
2130                         adev->family = AMDGPU_FAMILY_CZ;
2131                 else
2132                         adev->family = AMDGPU_FAMILY_VI;
2133
2134                 r = vi_set_ip_blocks(adev);
2135                 if (r)
2136                         return r;
2137                 break;
2138         default:
2139                 r = amdgpu_discovery_set_ip_blocks(adev);
2140                 if (r)
2141                         return r;
2142                 break;
2143         }
2144
2145         if (amdgpu_has_atpx() &&
2146             (amdgpu_is_atpx_hybrid() ||
2147              amdgpu_has_atpx_dgpu_power_cntl()) &&
2148             ((adev->flags & AMD_IS_APU) == 0) &&
2149             !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2150                 adev->flags |= AMD_IS_PX;
2151
2152         if (!(adev->flags & AMD_IS_APU)) {
2153                 parent = pci_upstream_bridge(adev->pdev);
2154                 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2155         }
2156
2157         amdgpu_amdkfd_device_probe(adev);
2158
2159         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2160         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2161                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2162         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2163                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2164
2165         for (i = 0; i < adev->num_ip_blocks; i++) {
2166                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2167                         DRM_ERROR("disabled ip block: %d <%s>\n",
2168                                   i, adev->ip_blocks[i].version->funcs->name);
2169                         adev->ip_blocks[i].status.valid = false;
2170                 } else {
2171                         if (adev->ip_blocks[i].version->funcs->early_init) {
2172                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2173                                 if (r == -ENOENT) {
2174                                         adev->ip_blocks[i].status.valid = false;
2175                                 } else if (r) {
2176                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2177                                                   adev->ip_blocks[i].version->funcs->name, r);
2178                                         return r;
2179                                 } else {
2180                                         adev->ip_blocks[i].status.valid = true;
2181                                 }
2182                         } else {
2183                                 adev->ip_blocks[i].status.valid = true;
2184                         }
2185                 }
2186                 /* get the vbios after the asic_funcs are set up */
2187                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2188                         r = amdgpu_device_parse_gpu_info_fw(adev);
2189                         if (r)
2190                                 return r;
2191
2192                         /* Read BIOS */
2193                         if (!amdgpu_get_bios(adev))
2194                                 return -EINVAL;
2195
2196                         r = amdgpu_atombios_init(adev);
2197                         if (r) {
2198                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2199                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2200                                 return r;
2201                         }
2202
2203                         /* get pf2vf msg info at its earliest time */
2204                         if (amdgpu_sriov_vf(adev))
2205                                 amdgpu_virt_init_data_exchange(adev);
2206
2207                 }
2208         }
2209
2210         adev->cg_flags &= amdgpu_cg_mask;
2211         adev->pg_flags &= amdgpu_pg_mask;
2212
2213         return 0;
2214 }
2215
2216 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2217 {
2218         int i, r;
2219
2220         for (i = 0; i < adev->num_ip_blocks; i++) {
2221                 if (!adev->ip_blocks[i].status.sw)
2222                         continue;
2223                 if (adev->ip_blocks[i].status.hw)
2224                         continue;
2225                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2226                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2227                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2228                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2229                         if (r) {
2230                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2231                                           adev->ip_blocks[i].version->funcs->name, r);
2232                                 return r;
2233                         }
2234                         adev->ip_blocks[i].status.hw = true;
2235                 }
2236         }
2237
2238         return 0;
2239 }
2240
2241 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2242 {
2243         int i, r;
2244
2245         for (i = 0; i < adev->num_ip_blocks; i++) {
2246                 if (!adev->ip_blocks[i].status.sw)
2247                         continue;
2248                 if (adev->ip_blocks[i].status.hw)
2249                         continue;
2250                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2251                 if (r) {
2252                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2253                                   adev->ip_blocks[i].version->funcs->name, r);
2254                         return r;
2255                 }
2256                 adev->ip_blocks[i].status.hw = true;
2257         }
2258
2259         return 0;
2260 }
2261
2262 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2263 {
2264         int r = 0;
2265         int i;
2266         uint32_t smu_version;
2267
2268         if (adev->asic_type >= CHIP_VEGA10) {
2269                 for (i = 0; i < adev->num_ip_blocks; i++) {
2270                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2271                                 continue;
2272
2273                         if (!adev->ip_blocks[i].status.sw)
2274                                 continue;
2275
2276                         /* no need to do the fw loading again if already done */
2277                         if (adev->ip_blocks[i].status.hw)
2278                                 break;
2279
2280                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2281                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2282                                 if (r) {
2283                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2284                                                           adev->ip_blocks[i].version->funcs->name, r);
2285                                         return r;
2286                                 }
2287                         } else {
2288                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2289                                 if (r) {
2290                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2291                                                           adev->ip_blocks[i].version->funcs->name, r);
2292                                         return r;
2293                                 }
2294                         }
2295
2296                         adev->ip_blocks[i].status.hw = true;
2297                         break;
2298                 }
2299         }
2300
2301         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2302                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2303
2304         return r;
2305 }
2306
2307 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2308 {
2309         long timeout;
2310         int r, i;
2311
2312         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2313                 struct amdgpu_ring *ring = adev->rings[i];
2314
2315                 /* No need to setup the GPU scheduler for rings that don't need it */
2316                 if (!ring || ring->no_scheduler)
2317                         continue;
2318
2319                 switch (ring->funcs->type) {
2320                 case AMDGPU_RING_TYPE_GFX:
2321                         timeout = adev->gfx_timeout;
2322                         break;
2323                 case AMDGPU_RING_TYPE_COMPUTE:
2324                         timeout = adev->compute_timeout;
2325                         break;
2326                 case AMDGPU_RING_TYPE_SDMA:
2327                         timeout = adev->sdma_timeout;
2328                         break;
2329                 default:
2330                         timeout = adev->video_timeout;
2331                         break;
2332                 }
2333
2334                 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2335                                    ring->num_hw_submission, amdgpu_job_hang_limit,
2336                                    timeout, adev->reset_domain->wq,
2337                                    ring->sched_score, ring->name,
2338                                    adev->dev);
2339                 if (r) {
2340                         DRM_ERROR("Failed to create scheduler on ring %s.\n",
2341                                   ring->name);
2342                         return r;
2343                 }
2344         }
2345
2346         return 0;
2347 }
2348
2350 /**
2351  * amdgpu_device_ip_init - run init for hardware IPs
2352  *
2353  * @adev: amdgpu_device pointer
2354  *
2355  * Main initialization pass for hardware IPs.  The list of all the hardware
2356  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2357  * are run.  sw_init initializes the software state associated with each IP
2358  * and hw_init initializes the hardware associated with each IP.
2359  * Returns 0 on success, negative error code on failure.
2360  */
2361 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2362 {
2363         int i, r;
2364
2365         r = amdgpu_ras_init(adev);
2366         if (r)
2367                 return r;
2368
2369         for (i = 0; i < adev->num_ip_blocks; i++) {
2370                 if (!adev->ip_blocks[i].status.valid)
2371                         continue;
2372                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2373                 if (r) {
2374                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2375                                   adev->ip_blocks[i].version->funcs->name, r);
2376                         goto init_failed;
2377                 }
2378                 adev->ip_blocks[i].status.sw = true;
2379
2380                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2381                         /* need to do common hw init early so everything is set up for gmc */
2382                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2383                         if (r) {
2384                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2385                                 goto init_failed;
2386                         }
2387                         adev->ip_blocks[i].status.hw = true;
2388                 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2389                         /* need to do gmc hw init early so we can allocate gpu mem */
2390                         /* Try to reserve bad pages early */
2391                         if (amdgpu_sriov_vf(adev))
2392                                 amdgpu_virt_exchange_data(adev);
2393
2394                         r = amdgpu_device_vram_scratch_init(adev);
2395                         if (r) {
2396                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2397                                 goto init_failed;
2398                         }
2399                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2400                         if (r) {
2401                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2402                                 goto init_failed;
2403                         }
2404                         r = amdgpu_device_wb_init(adev);
2405                         if (r) {
2406                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2407                                 goto init_failed;
2408                         }
2409                         adev->ip_blocks[i].status.hw = true;
2410
2411                         /* right after GMC hw init, we create CSA */
2412                         if (amdgpu_mcbp) {
2413                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2414                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2415                                                                 AMDGPU_CSA_SIZE);
2416                                 if (r) {
2417                                         DRM_ERROR("allocate CSA failed %d\n", r);
2418                                         goto init_failed;
2419                                 }
2420                         }
2421                 }
2422         }
2423
2424         if (amdgpu_sriov_vf(adev))
2425                 amdgpu_virt_init_data_exchange(adev);
2426
2427         r = amdgpu_ib_pool_init(adev);
2428         if (r) {
2429                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2430                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2431                 goto init_failed;
2432         }
2433
2434         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2435         if (r)
2436                 goto init_failed;
2437
2438         r = amdgpu_device_ip_hw_init_phase1(adev);
2439         if (r)
2440                 goto init_failed;
2441
2442         r = amdgpu_device_fw_loading(adev);
2443         if (r)
2444                 goto init_failed;
2445
2446         r = amdgpu_device_ip_hw_init_phase2(adev);
2447         if (r)
2448                 goto init_failed;
2449
2450         /*
2451          * retired pages will be loaded from eeprom and reserved here;
2452          * it should be called after amdgpu_device_ip_hw_init_phase2 since
2453          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2454          * functional for I2C communication, which is only true at this point.
2455          *
2456          * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2457          * about failures caused by a bad gpu situation and stops the amdgpu
2458          * init process accordingly. For other failed cases, it will still
2459          * release all the resources and print an error message, rather than
2460          * returning a negative value to the upper level.
2461          *
2462          * Note: theoretically, this should be called before all vram allocations
2463          * to protect retired pages from being reused.
2464          */
2465         r = amdgpu_ras_recovery_init(adev);
2466         if (r)
2467                 goto init_failed;
2468
2469         /*
2470          * In case of XGMI, grab an extra reference on the reset domain for this device
2471          */
2472         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2473                 if (amdgpu_xgmi_add_device(adev) == 0) {
2474                         if (!amdgpu_sriov_vf(adev)) {
2475                                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2476
2477                                 if (!hive->reset_domain ||
2478                                     !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2479                                         r = -ENOENT;
2480                                         amdgpu_put_xgmi_hive(hive);
2481                                         goto init_failed;
2482                                 }
2483
2484                                 /* Drop the early temporary reset domain we created for device */
2485                                 amdgpu_reset_put_reset_domain(adev->reset_domain);
2486                                 adev->reset_domain = hive->reset_domain;
2487                                 amdgpu_put_xgmi_hive(hive);
2488                         }
2489                 }
2490         }
2491
2492         r = amdgpu_device_init_schedulers(adev);
2493         if (r)
2494                 goto init_failed;
2495
2496         /* Don't init kfd if the whole hive needs to be reset during init */
2497         if (!adev->gmc.xgmi.pending_reset)
2498                 amdgpu_amdkfd_device_init(adev);
2499
2500         amdgpu_fru_get_product_info(adev);
2501
2502 init_failed:
2503         if (amdgpu_sriov_vf(adev))
2504                 amdgpu_virt_release_full_gpu(adev, true);
2505
2506         return r;
2507 }
2508
2509 /**
2510  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2511  *
2512  * @adev: amdgpu_device pointer
2513  *
2514  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2515  * this function before a GPU reset.  If the value is retained after a
2516  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2517  */
2518 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2519 {
2520         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2521 }
2522
2523 /**
2524  * amdgpu_device_check_vram_lost - check if vram is valid
2525  *
2526  * @adev: amdgpu_device pointer
2527  *
2528  * Checks the reset magic value written to the gart pointer in VRAM.
2529  * The driver calls this after a GPU reset to see if the contents of
2530  * VRAM are lost or not.
2531  * returns true if vram is lost, false if not.
2532  */
2533 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2534 {
2535         if (memcmp(adev->gart.ptr, adev->reset_magic,
2536                         AMDGPU_RESET_MAGIC_NUM))
2537                 return true;
2538
2539         if (!amdgpu_in_reset(adev))
2540                 return false;
2541
2542         /*
2543          * For all ASICs with baco/mode1 reset, the VRAM is
2544          * always assumed to be lost.
2545          */
2546         switch (amdgpu_asic_reset_method(adev)) {
2547         case AMD_RESET_METHOD_BACO:
2548         case AMD_RESET_METHOD_MODE1:
2549                 return true;
2550         default:
2551                 return false;
2552         }
2553 }
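
/*
 * Editorial note: the two helpers above are used as a pair around a GPU
 * reset, roughly:
 *
 *	amdgpu_device_fill_reset_magic(adev);	// before reset (late init)
 *	... GPU reset happens ...
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *
 * If the magic value survived in GART-visible VRAM and the reset method
 * is not one that always wipes VRAM (baco/mode1 always count as lost),
 * buffer contents can be trusted after the reset.
 */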
2554
2555 /**
2556  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2557  *
2558  * @adev: amdgpu_device pointer
2559  * @state: clockgating state (gate or ungate)
2560  *
2561  * The list of all the hardware IPs that make up the asic is walked and the
2562  * set_clockgating_state callbacks are run.
2563  * The late initialization pass enables clockgating for hardware IPs;
2564  * the fini or suspend pass disables clockgating for hardware IPs.
2565  * Returns 0 on success, negative error code on failure.
2566  */
2568 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2569                                enum amd_clockgating_state state)
2570 {
2571         int i, j, r;
2572
2573         if (amdgpu_emu_mode == 1)
2574                 return 0;
2575
2576         for (j = 0; j < adev->num_ip_blocks; j++) {
2577                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2578                 if (!adev->ip_blocks[i].status.late_initialized)
2579                         continue;
2580                 /* skip CG for GFX on S0ix */
2581                 if (adev->in_s0ix &&
2582                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2583                         continue;
2584                 /* skip CG for VCE/UVD, it's handled specially */
2585                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2586                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2587                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2588                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2589                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2590                         /* enable clockgating to save power */
2591                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2592                                                                                      state);
2593                         if (r) {
2594                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2595                                           adev->ip_blocks[i].version->funcs->name, r);
2596                                 return r;
2597                         }
2598                 }
2599         }
2600
2601         return 0;
2602 }
2603
2604 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2605                                enum amd_powergating_state state)
2606 {
2607         int i, j, r;
2608
2609         if (amdgpu_emu_mode == 1)
2610                 return 0;
2611
2612         for (j = 0; j < adev->num_ip_blocks; j++) {
2613                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2614                 if (!adev->ip_blocks[i].status.late_initialized)
2615                         continue;
2616                 /* skip PG for GFX on S0ix */
2617                 if (adev->in_s0ix &&
2618                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2619                         continue;
2620                 /* skip PG for VCE/UVD, it's handled specially */
2621                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2622                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2623                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2624                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2625                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2626                         /* enable powergating to save power */
2627                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2628                                                                                         state);
2629                         if (r) {
2630                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2631                                           adev->ip_blocks[i].version->funcs->name, r);
2632                                 return r;
2633                         }
2634                 }
2635         }
2636         return 0;
2637 }
2638
2639 static int amdgpu_device_enable_mgpu_fan_boost(void)
2640 {
2641         struct amdgpu_gpu_instance *gpu_ins;
2642         struct amdgpu_device *adev;
2643         int i, ret = 0;
2644
2645         mutex_lock(&mgpu_info.mutex);
2646
2647         /*
2648          * MGPU fan boost feature should be enabled
2649          * only when there are two or more dGPUs in
2650          * the system
2651          */
2652         if (mgpu_info.num_dgpu < 2)
2653                 goto out;
2654
2655         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2656                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2657                 adev = gpu_ins->adev;
2658                 if (!(adev->flags & AMD_IS_APU) &&
2659                     !gpu_ins->mgpu_fan_enabled) {
2660                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2661                         if (ret)
2662                                 break;
2663
2664                         gpu_ins->mgpu_fan_enabled = 1;
2665                 }
2666         }
2667
2668 out:
2669         mutex_unlock(&mgpu_info.mutex);
2670
2671         return ret;
2672 }
2673
2674 /**
2675  * amdgpu_device_ip_late_init - run late init for hardware IPs
2676  *
2677  * @adev: amdgpu_device pointer
2678  *
2679  * Late initialization pass for hardware IPs.  The list of all the hardware
2680  * IPs that make up the asic is walked and the late_init callbacks are run.
2681  * late_init covers any special initialization that an IP requires
2682  * after all of the IPs have been initialized or something that needs to happen
2683  * late in the init process.
2684  * Returns 0 on success, negative error code on failure.
2685  */
2686 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2687 {
2688         struct amdgpu_gpu_instance *gpu_instance;
2689         int i = 0, r;
2690
2691         for (i = 0; i < adev->num_ip_blocks; i++) {
2692                 if (!adev->ip_blocks[i].status.hw)
2693                         continue;
2694                 if (adev->ip_blocks[i].version->funcs->late_init) {
2695                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2696                         if (r) {
2697                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2698                                           adev->ip_blocks[i].version->funcs->name, r);
2699                                 return r;
2700                         }
2701                 }
2702                 adev->ip_blocks[i].status.late_initialized = true;
2703         }
2704
2705         r = amdgpu_ras_late_init(adev);
2706         if (r) {
2707                 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2708                 return r;
2709         }
2710
2711         amdgpu_ras_set_error_query_ready(adev, true);
2712
2713         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2714         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2715
2716         amdgpu_device_fill_reset_magic(adev);
2717
2718         r = amdgpu_device_enable_mgpu_fan_boost();
2719         if (r)
2720                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2721
2722         /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */
2723         if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2724                                adev->asic_type == CHIP_ALDEBARAN))
2725                 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2726
2727         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2728                 mutex_lock(&mgpu_info.mutex);
2729
2730                 /*
2731                  * Reset device p-state to low as this was booted with high.
2732                  *
2733                  * This should be performed only after all devices from the same
2734                  * hive get initialized.
2735                  *
2736                  * However, it's unknown in advance how many devices are in the
2737                  * hive, as they are counted one by one during device initialization.
2738                  *
2739                  * So, we wait for all XGMI interlinked devices to be initialized.
2740                  * This may bring some delays as those devices may come from
2741                  * different hives. But that should be OK.
2742                  */
2743                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2744                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2745                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2746                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2747                                         continue;
2748
2749                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2750                                                 AMDGPU_XGMI_PSTATE_MIN);
2751                                 if (r) {
2752                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2753                                         break;
2754                                 }
2755                         }
2756                 }
2757
2758                 mutex_unlock(&mgpu_info.mutex);
2759         }
2760
2761         return 0;
2762 }
2763
2764 /**
2765  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2766  *
2767  * @adev: amdgpu_device pointer
2768  *
2769  * For ASICs that need to disable the SMC first
2770  */
2771 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2772 {
2773         int i, r;
2774
2775         if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2776                 return;
2777
2778         for (i = 0; i < adev->num_ip_blocks; i++) {
2779                 if (!adev->ip_blocks[i].status.hw)
2780                         continue;
2781                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2782                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2783                         /* XXX handle errors */
2784                         if (r) {
2785                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2786                                           adev->ip_blocks[i].version->funcs->name, r);
2787                         }
2788                         adev->ip_blocks[i].status.hw = false;
2789                         break;
2790                 }
2791         }
2792 }
2793
2794 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2795 {
2796         int i, r;
2797
2798         for (i = 0; i < adev->num_ip_blocks; i++) {
2799                 if (!adev->ip_blocks[i].version->funcs->early_fini)
2800                         continue;
2801
2802                 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2803                 if (r) {
2804                         DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2805                                   adev->ip_blocks[i].version->funcs->name, r);
2806                 }
2807         }
2808
2809         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2810         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2811
2812         amdgpu_amdkfd_suspend(adev, false);
2813
2814         /* Workaround for ASICs that need to disable the SMC first */
2815         amdgpu_device_smu_fini_early(adev);
2816
2817         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2818                 if (!adev->ip_blocks[i].status.hw)
2819                         continue;
2820
2821                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2822                 /* XXX handle errors */
2823                 if (r) {
2824                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2825                                   adev->ip_blocks[i].version->funcs->name, r);
2826                 }
2827
2828                 adev->ip_blocks[i].status.hw = false;
2829         }
2830
2831         if (amdgpu_sriov_vf(adev)) {
2832                 if (amdgpu_virt_release_full_gpu(adev, false))
2833                         DRM_ERROR("failed to release exclusive mode on fini\n");
2834         }
2835
2836         return 0;
2837 }
2838
2839 /**
2840  * amdgpu_device_ip_fini - run fini for hardware IPs
2841  *
2842  * @adev: amdgpu_device pointer
2843  *
2844  * Main teardown pass for hardware IPs.  The list of all the hardware
2845  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2846  * are run.  hw_fini tears down the hardware associated with each IP
2847  * and sw_fini tears down any software state associated with each IP.
2848  * Returns 0 on success, negative error code on failure.
2849  */
2850 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2851 {
2852         int i, r;
2853
2854         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2855                 amdgpu_virt_release_ras_err_handler_data(adev);
2856
2857         if (adev->gmc.xgmi.num_physical_nodes > 1)
2858                 amdgpu_xgmi_remove_device(adev);
2859
2860         amdgpu_amdkfd_device_fini_sw(adev);
2861
2862         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2863                 if (!adev->ip_blocks[i].status.sw)
2864                         continue;
2865
2866                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2867                         amdgpu_ucode_free_bo(adev);
2868                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2869                         amdgpu_device_wb_fini(adev);
2870                         amdgpu_device_vram_scratch_fini(adev);
2871                         amdgpu_ib_pool_fini(adev);
2872                 }
2873
2874                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2875                 /* XXX handle errors */
2876                 if (r) {
2877                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2878                                   adev->ip_blocks[i].version->funcs->name, r);
2879                 }
2880                 adev->ip_blocks[i].status.sw = false;
2881                 adev->ip_blocks[i].status.valid = false;
2882         }
2883
2884         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2885                 if (!adev->ip_blocks[i].status.late_initialized)
2886                         continue;
2887                 if (adev->ip_blocks[i].version->funcs->late_fini)
2888                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2889                 adev->ip_blocks[i].status.late_initialized = false;
2890         }
2891
2892         amdgpu_ras_fini(adev);
2893
2894         return 0;
2895 }
2896
2897 /**
2898  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2899  *
2900  * @work: work_struct.
2901  */
2902 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2903 {
2904         struct amdgpu_device *adev =
2905                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2906         int r;
2907
2908         r = amdgpu_ib_ring_tests(adev);
2909         if (r)
2910                 DRM_ERROR("ib ring test failed (%d).\n", r);
2911 }
2912
2913 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2914 {
2915         struct amdgpu_device *adev =
2916                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2917
2918         WARN_ON_ONCE(adev->gfx.gfx_off_state);
2919         WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2920
2921         if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2922                 adev->gfx.gfx_off_state = true;
2923 }
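
/*
 * Illustrative usage, hedged: code that must touch GFX registers brackets
 * the access with amdgpu_gfx_off_ctrl(), which bumps gfx_off_req_count and
 * re-arms the delayed work above once the count drops back to zero.
 * amdgpu_example_poke_gfx() is a hypothetical caller.
 */
static inline void amdgpu_example_poke_gfx(struct amdgpu_device *adev)
{
        amdgpu_gfx_off_ctrl(adev, false);       /* keep GFX out of gfxoff */
        /* ... GFX register access is safe here ... */
        amdgpu_gfx_off_ctrl(adev, true);        /* allow gfxoff again */
}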
2924
2925 /**
2926  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2927  *
2928  * @adev: amdgpu_device pointer
2929  *
2930  * First phase of the main suspend pass.  Clockgating and powergating are
2931  * ungated and the suspend callbacks are run for the display (DCE) blocks
2932  * only; all other hardware IPs are handled in phase 2.  suspend puts the
2933  * hardware and software state in each IP into a state suitable for suspend.
2934  * Returns 0 on success, negative error code on failure.
2935  */
2936 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2937 {
2938         int i, r;
2939
2940         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2941         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2942
2943         /*
2944          * Per the PMFW team's suggestion, the driver needs to disable the
2945          * gfxoff and df cstate features for the gpu reset (e.g. Mode1Reset)
2946          * scenario. Add the missing df cstate disablement here.
2947          */
2948         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2949                 dev_warn(adev->dev, "Failed to disallow df cstate\n");
2950
2951         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2952                 if (!adev->ip_blocks[i].status.valid)
2953                         continue;
2954
2955                 /* displays are handled separately */
2956                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2957                         continue;
2958
2960                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2961                 /* XXX handle errors */
2962                 if (r) {
2963                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2964                                   adev->ip_blocks[i].version->funcs->name, r);
2965                         return r;
2966                 }
2967
2968                 adev->ip_blocks[i].status.hw = false;
2969         }
2970
2971         return 0;
2972 }
2973
2974 /**
2975  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2976  *
2977  * @adev: amdgpu_device pointer
2978  *
2979  * Second phase of the main suspend pass.  All hardware IPs other than
2980  * the displays (handled in phase 1) are walked and their suspend
2981  * callbacks are run.  suspend puts the hardware and software state in
2982  * each IP into a state suitable for suspend.
2983  * Returns 0 on success, negative error code on failure.
2984  */
2985 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2986 {
2987         int i, r;
2988
2989         if (adev->in_s0ix)
2990                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2991
2992         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2993                 if (!adev->ip_blocks[i].status.valid)
2994                         continue;
2995                 /* displays are handled in phase1 */
2996                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2997                         continue;
2998                 /* PSP lost connection when err_event_athub occurs */
2999                 if (amdgpu_ras_intr_triggered() &&
3000                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3001                         adev->ip_blocks[i].status.hw = false;
3002                         continue;
3003                 }
3004
3005                 /* skip unnecessary suspend if the blocks have not been initialized yet */
3006                 if (adev->gmc.xgmi.pending_reset &&
3007                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3008                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3009                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3010                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3011                         adev->ip_blocks[i].status.hw = false;
3012                         continue;
3013                 }
3014
3015                 /* skip suspend of gfx and psp for S0ix
3016                  * gfx is in gfxoff state, so on resume it will exit gfxoff just
3017                  * like at runtime. PSP is also part of the always-on hardware,
3018                  * so there is no need to suspend it.
3019                  */
3020                 if (adev->in_s0ix &&
3021                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3022                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
3023                         continue;
3024
3026                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3027                 /* XXX handle errors */
3028                 if (r) {
3029                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
3030                                   adev->ip_blocks[i].version->funcs->name, r);
3031                 }
3032                 adev->ip_blocks[i].status.hw = false;
3033                 /* handle putting the SMC in the appropriate state */
3034                 if (!amdgpu_sriov_vf(adev)) {
3035                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3036                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3037                                 if (r) {
3038                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3039                                                         adev->mp1_state, r);
3040                                         return r;
3041                                 }
3042                         }
3043                 }
3044         }
3045
3046         return 0;
3047 }
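
/*
 * Illustrative sketch, not in the upstream driver: the S0ix skip above,
 * restated as a predicate.  amdgpu_example_s0ix_keeps_block() is a
 * hypothetical helper.
 */
static inline bool amdgpu_example_s0ix_keeps_block(struct amdgpu_device *adev,
                                                   enum amd_ip_block_type type)
{
        /* GFX sits in gfxoff and PSP is always-on hardware during S0ix */
        return adev->in_s0ix &&
               (type == AMD_IP_BLOCK_TYPE_PSP || type == AMD_IP_BLOCK_TYPE_GFX);
}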
3048
3049 /**
3050  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3051  *
3052  * @adev: amdgpu_device pointer
3053  *
3054  * Main suspend function for hardware IPs.  The list of all the hardware
3055  * IPs that make up the asic is walked, clockgating is disabled and the
3056  * suspend callbacks are run.  suspend puts the hardware and software state
3057  * in each IP into a state suitable for suspend.
3058  * Returns 0 on success, negative error code on failure.
3059  */
3060 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3061 {
3062         int r;
3063
3064         if (amdgpu_sriov_vf(adev)) {
3065                 amdgpu_virt_fini_data_exchange(adev);
3066                 amdgpu_virt_request_full_gpu(adev, false);
3067         }
3068
3069         r = amdgpu_device_ip_suspend_phase1(adev);
3070         if (r)
3071                 return r;
3072         r = amdgpu_device_ip_suspend_phase2(adev);
3073
3074         if (amdgpu_sriov_vf(adev))
3075                 amdgpu_virt_release_full_gpu(adev, false);
3076
3077         return r;
3078 }
3079
3080 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3081 {
3082         int i, r;
3083
3084         static enum amd_ip_block_type ip_order[] = {
3085                 AMD_IP_BLOCK_TYPE_COMMON,
3086                 AMD_IP_BLOCK_TYPE_GMC,
3087                 AMD_IP_BLOCK_TYPE_PSP,
3088                 AMD_IP_BLOCK_TYPE_IH,
3089         };
3090
3091         for (i = 0; i < adev->num_ip_blocks; i++) {
3092                 int j;
3093                 struct amdgpu_ip_block *block;
3094
3095                 block = &adev->ip_blocks[i];
3096                 block->status.hw = false;
3097
3098                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3099
3100                         if (block->version->type != ip_order[j] ||
3101                                 !block->status.valid)
3102                                 continue;
3103
3104                         r = block->version->funcs->hw_init(adev);
3105                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
3106                         if (r)
3107                                 return r;
3108                         block->status.hw = true;
3109                 }
3110         }
3111
3112         return 0;
3113 }
3114
3115 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3116 {
3117         int i, r;
3118
3119         static enum amd_ip_block_type ip_order[] = {
3120                 AMD_IP_BLOCK_TYPE_SMC,
3121                 AMD_IP_BLOCK_TYPE_DCE,
3122                 AMD_IP_BLOCK_TYPE_GFX,
3123                 AMD_IP_BLOCK_TYPE_SDMA,
3124                 AMD_IP_BLOCK_TYPE_UVD,
3125                 AMD_IP_BLOCK_TYPE_VCE,
3126                 AMD_IP_BLOCK_TYPE_VCN
3127         };
3128
3129         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3130                 int j;
3131                 struct amdgpu_ip_block *block;
3132
3133                 for (j = 0; j < adev->num_ip_blocks; j++) {
3134                         block = &adev->ip_blocks[j];
3135
3136                         if (block->version->type != ip_order[i] ||
3137                                 !block->status.valid ||
3138                                 block->status.hw)
3139                                 continue;
3140
3141                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3142                                 r = block->version->funcs->resume(adev);
3143                         else
3144                                 r = block->version->funcs->hw_init(adev);
3145
3146                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
3147                         if (r)
3148                                 return r;
3149                         block->status.hw = true;
3150                 }
3151         }
3152
3153         return 0;
3154 }
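
/*
 * Illustrative sketch, not part of the upstream driver: the ordered
 * re-init above is close to looking up one block per type (cf.
 * amdgpu_device_ip_get_ip_block() earlier in this file).  The helper
 * below is hypothetical and only restates the inner loop.
 */
static inline struct amdgpu_ip_block *
amdgpu_example_find_ip_block(struct amdgpu_device *adev,
                             enum amd_ip_block_type type)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++)
                if (adev->ip_blocks[i].version->type == type)
                        return &adev->ip_blocks[i];
        return NULL;
}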
3155
3156 /**
3157  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3158  *
3159  * @adev: amdgpu_device pointer
3160  *
3161  * First resume function for hardware IPs.  The list of all the hardware
3162  * IPs that make up the asic is walked and the resume callbacks are run for
3163  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3164  * after a suspend and updates the software state as necessary.  This
3165  * function is also used for restoring the GPU after a GPU reset.
3166  * Returns 0 on success, negative error code on failure.
3167  */
3168 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3169 {
3170         int i, r;
3171
3172         for (i = 0; i < adev->num_ip_blocks; i++) {
3173                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3174                         continue;
3175                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3176                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3177                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3178                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3179
3180                         r = adev->ip_blocks[i].version->funcs->resume(adev);
3181                         if (r) {
3182                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
3183                                           adev->ip_blocks[i].version->funcs->name, r);
3184                                 return r;
3185                         }
3186                         adev->ip_blocks[i].status.hw = true;
3187                 }
3188         }
3189
3190         return 0;
3191 }
3192
3193 /**
3194  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3195  *
3196  * @adev: amdgpu_device pointer
3197  *
3198  * Second resume function for hardware IPs.  The list of all the hardware
3199  * IPs that make up the asic is walked and the resume callbacks are run for
3200  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3201  * functional state after a suspend and updates the software state as
3202  * necessary.  This function is also used for restoring the GPU after a GPU
3203  * reset.
3204  * Returns 0 on success, negative error code on failure.
3205  */
3206 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3207 {
3208         int i, r;
3209
3210         for (i = 0; i < adev->num_ip_blocks; i++) {
3211                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3212                         continue;
3213                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3214                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3215                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3216                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3217                         continue;
3218                 r = adev->ip_blocks[i].version->funcs->resume(adev);
3219                 if (r) {
3220                         DRM_ERROR("resume of IP block <%s> failed %d\n",
3221                                   adev->ip_blocks[i].version->funcs->name, r);
3222                         return r;
3223                 }
3224                 adev->ip_blocks[i].status.hw = true;
3225
3226                 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3227                         /* disable gfxoff for IP resume. The gfxoff will be re-enabled in
3228                          * amdgpu_device_resume() after IP resume.
3229                          */
3230                         amdgpu_gfx_off_ctrl(adev, false);
3231                         DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
3232                 }
3233
3234         }
3235
3236         return 0;
3237 }
3238
3239 /**
3240  * amdgpu_device_ip_resume - run resume for hardware IPs
3241  *
3242  * @adev: amdgpu_device pointer
3243  *
3244  * Main resume function for hardware IPs.  The hardware IPs
3245  * are split into two resume functions because they are
3246  * also used in recovering from a GPU reset and some additional
3247  * steps need to be taken between them.  In this case (S3/S4) they are
3248  * run sequentially.
3249  * Returns 0 on success, negative error code on failure.
3250  */
3251 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3252 {
3253         int r;
3254
3255         r = amdgpu_amdkfd_resume_iommu(adev);
3256         if (r)
3257                 return r;
3258
3259         r = amdgpu_device_ip_resume_phase1(adev);
3260         if (r)
3261                 return r;
3262
3263         r = amdgpu_device_fw_loading(adev);
3264         if (r)
3265                 return r;
3266
3267         r = amdgpu_device_ip_resume_phase2(adev);
3268
3269         return r;
3270 }
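
/*
 * Illustrative ordering, matching the body above: phase1 brings up
 * COMMON, GMC and IH (plus PSP on SR-IOV), amdgpu_device_fw_loading()
 * then loads microcode, and phase2 resumes the remaining blocks.
 */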
3271
3272 /**
3273  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3274  *
3275  * @adev: amdgpu_device pointer
3276  *
3277  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3278  */
3279 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3280 {
3281         if (amdgpu_sriov_vf(adev)) {
3282                 if (adev->is_atom_fw) {
3283                         if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3284                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3285                 } else {
3286                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3287                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3288                 }
3289
3290                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3291                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3292         }
3293 }
3294
3295 /**
3296  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3297  *
3298  * @asic_type: AMD asic type
3299  *
3300  * Check if there is DC (new modesetting infrastructure) support for an asic.
3301  * Returns true if DC has support, false if not.
3302  */
3303 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3304 {
3305         switch (asic_type) {
3306 #ifdef CONFIG_DRM_AMDGPU_SI
3307         case CHIP_HAINAN:
3308 #endif
3309         case CHIP_TOPAZ:
3310                 /* chips with no display hardware */
3311                 return false;
3312 #if defined(CONFIG_DRM_AMD_DC)
3313         case CHIP_TAHITI:
3314         case CHIP_PITCAIRN:
3315         case CHIP_VERDE:
3316         case CHIP_OLAND:
3317                 /*
3318                  * We have systems in the wild with these ASICs that require
3319                  * LVDS and VGA support which is not supported with DC.
3320                  *
3321                  * Fall back to the non-DC driver here by default so as not to
3322                  * cause regressions.
3323                  */
3324 #if defined(CONFIG_DRM_AMD_DC_SI)
3325                 return amdgpu_dc > 0;
3326 #else
3327                 return false;
3328 #endif
3329         case CHIP_BONAIRE:
3330         case CHIP_KAVERI:
3331         case CHIP_KABINI:
3332         case CHIP_MULLINS:
3333                 /*
3334                  * We have systems in the wild with these ASICs that require
3335                  * VGA support which is not supported with DC.
3336                  *
3337                  * Fall back to the non-DC driver here by default so as not to
3338                  * cause regressions.
3339                  */
3340                 return amdgpu_dc > 0;
3341         default:
3342                 return amdgpu_dc != 0;
3343 #else
3344         default:
3345                 if (amdgpu_dc > 0)
3346                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3347                                          "but isn't supported by ASIC, ignoring\n");
3348                 return false;
3349 #endif
3350         }
3351 }
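
/*
 * Illustrative note, hedged: amdgpu_dc is the "dc" module parameter
 * (-1 = per-ASIC default, 0 = force DC off, 1 = force DC on), so e.g.
 * booting with amdgpu.dc=1 makes the SI/CIK cases above return true
 * when the corresponding DC Kconfig options are enabled.
 */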
3352
3353 /**
3354  * amdgpu_device_has_dc_support - check if dc is supported
3355  *
3356  * @adev: amdgpu_device pointer
3357  *
3358  * Returns true for supported, false for not supported
3359  */
3360 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3361 {
3362         if (adev->enable_virtual_display ||
3363             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3364                 return false;
3365
3366         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3367 }
3368
3369 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3370 {
3371         struct amdgpu_device *adev =
3372                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3373         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3374
3375         /* It's a bug to not have a hive within this function */
3376         if (WARN_ON(!hive))
3377                 return;
3378
3379         /*
3380          * Use task barrier to synchronize all xgmi reset works across the
3381          * hive. task_barrier_enter and task_barrier_exit will block
3382          * until all the threads running the xgmi reset works reach
3383          * those points. task_barrier_full will do both blocks.
3384          */
3385         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3386
3387                 task_barrier_enter(&hive->tb);
3388                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3389
3390                 if (adev->asic_reset_res)
3391                         goto fail;
3392
3393                 task_barrier_exit(&hive->tb);
3394                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3395
3396                 if (adev->asic_reset_res)
3397                         goto fail;
3398
3399                 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3400                     adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3401                         adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3402         } else {
3403
3404                 task_barrier_full(&hive->tb);
3405                 adev->asic_reset_res = amdgpu_asic_reset(adev);
3406         }
3407
3408 fail:
3409         if (adev->asic_reset_res)
3410                 DRM_WARN("ASIC reset failed with error %d for drm dev %s\n",
3411                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3412         amdgpu_put_xgmi_hive(hive);
3413 }
3414
3415 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3416 {
3417         char *input = amdgpu_lockup_timeout;
3418         char *timeout_setting = NULL;
3419         int index = 0;
3420         long timeout;
3421         int ret = 0;
3422
3423         /*
3424          * By default the timeout for non-compute jobs is 10000 ms
3425          * and 60000 ms for compute jobs.
3426          * Under SR-IOV the compute timeout is 60000 ms only in
3427          * one-VF mode; otherwise it falls back to 10000 ms.
3428          */
3429         adev->gfx_timeout = msecs_to_jiffies(10000);
3430         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3431         if (amdgpu_sriov_vf(adev))
3432                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3433                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3434         else
3435                 adev->compute_timeout = msecs_to_jiffies(60000);
3436
3437         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3438                 while ((timeout_setting = strsep(&input, ",")) &&
3439                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3440                         ret = kstrtol(timeout_setting, 0, &timeout);
3441                         if (ret)
3442                                 return ret;
3443
3444                         if (timeout == 0) {
3445                                 index++;
3446                                 continue;
3447                         } else if (timeout < 0) {
3448                                 timeout = MAX_SCHEDULE_TIMEOUT;
3449                                 dev_warn(adev->dev, "lockup timeout disabled");
3450                                 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3451                         } else {
3452                                 timeout = msecs_to_jiffies(timeout);
3453                         }
3454
3455                         switch (index++) {
3456                         case 0:
3457                                 adev->gfx_timeout = timeout;
3458                                 break;
3459                         case 1:
3460                                 adev->compute_timeout = timeout;
3461                                 break;
3462                         case 2:
3463                                 adev->sdma_timeout = timeout;
3464                                 break;
3465                         case 3:
3466                                 adev->video_timeout = timeout;
3467                                 break;
3468                         default:
3469                                 break;
3470                         }
3471                 }
3472                 /*
3473                  * There is only one value specified and
3474                  * it should apply to all non-compute jobs.
3475                  */
3476                 if (index == 1) {
3477                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3478                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3479                                 adev->compute_timeout = adev->gfx_timeout;
3480                 }
3481         }
3482
3483         return ret;
3484 }
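
/*
 * Illustrative example matching the parsing above (values in ms):
 *
 *      amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * sets the gfx, compute, sdma and video timeouts in that order; a single
 * value applies to all non-compute queues, 0 keeps the default and a
 * negative value disables the timeout entirely.
 */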
3485
3486 /**
3487  * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
3488  *
3489  * @adev: amdgpu_device pointer
3490  *
3491  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3492  */
3493 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3494 {
3495         struct iommu_domain *domain;
3496
3497         domain = iommu_get_domain_for_dev(adev->dev);
3498         if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3499                 adev->ram_is_direct_mapped = true;
3500 }
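
/*
 * Example of what the check above captures, hedged: with no IOMMU domain
 * or an identity-mapped one (e.g. booting with iommu=pt on x86), DMA
 * addresses equal physical addresses, so RAM is effectively direct
 * mapped from the GPU's point of view.
 */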
3501
3502 static const struct attribute *amdgpu_dev_attributes[] = {
3503         &dev_attr_product_name.attr,
3504         &dev_attr_product_number.attr,
3505         &dev_attr_serial_number.attr,
3506         &dev_attr_pcie_replay_count.attr,
3507         NULL
3508 };
3509
3510 /**
3511  * amdgpu_device_init - initialize the driver
3512  *
3513  * @adev: amdgpu_device pointer
3514  * @flags: driver flags
3515  *
3516  * Initializes the driver info and hw (all asics).
3517  * Returns 0 for success or an error on failure.
3518  * Called at driver startup.
3519  */
3520 int amdgpu_device_init(struct amdgpu_device *adev,
3521                        uint32_t flags)
3522 {
3523         struct drm_device *ddev = adev_to_drm(adev);
3524         struct pci_dev *pdev = adev->pdev;
3525         int r, i;
3526         bool px = false;
3527         u32 max_MBps;
3528
3529         adev->shutdown = false;
3530         adev->flags = flags;
3531
3532         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3533                 adev->asic_type = amdgpu_force_asic_type;
3534         else
3535                 adev->asic_type = flags & AMD_ASIC_MASK;
3536
3537         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3538         if (amdgpu_emu_mode == 1)
3539                 adev->usec_timeout *= 10;
3540         adev->gmc.gart_size = 512 * 1024 * 1024;
3541         adev->accel_working = false;
3542         adev->num_rings = 0;
3543         RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3544         adev->mman.buffer_funcs = NULL;
3545         adev->mman.buffer_funcs_ring = NULL;
3546         adev->vm_manager.vm_pte_funcs = NULL;
3547         adev->vm_manager.vm_pte_num_scheds = 0;
3548         adev->gmc.gmc_funcs = NULL;
3549         adev->harvest_ip_mask = 0x0;
3550         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3551         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3552
3553         adev->smc_rreg = &amdgpu_invalid_rreg;
3554         adev->smc_wreg = &amdgpu_invalid_wreg;
3555         adev->pcie_rreg = &amdgpu_invalid_rreg;
3556         adev->pcie_wreg = &amdgpu_invalid_wreg;
3557         adev->pciep_rreg = &amdgpu_invalid_rreg;
3558         adev->pciep_wreg = &amdgpu_invalid_wreg;
3559         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3560         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3561         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3562         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3563         adev->didt_rreg = &amdgpu_invalid_rreg;
3564         adev->didt_wreg = &amdgpu_invalid_wreg;
3565         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3566         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3567         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3568         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3569
3570         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3571                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3572                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3573
3574         /* mutex initialization is all done here so we
3575          * can call functions without running into locking issues */
3576         mutex_init(&adev->firmware.mutex);
3577         mutex_init(&adev->pm.mutex);
3578         mutex_init(&adev->gfx.gpu_clock_mutex);
3579         mutex_init(&adev->srbm_mutex);
3580         mutex_init(&adev->gfx.pipe_reserve_mutex);
3581         mutex_init(&adev->gfx.gfx_off_mutex);
3582         mutex_init(&adev->grbm_idx_mutex);
3583         mutex_init(&adev->mn_lock);
3584         mutex_init(&adev->virt.vf_errors.lock);
3585         hash_init(adev->mn_hash);
3586         mutex_init(&adev->psp.mutex);
3587         mutex_init(&adev->notifier_lock);
3588         mutex_init(&adev->pm.stable_pstate_ctx_lock);
3589         mutex_init(&adev->benchmark_mutex);
3590
3591         amdgpu_device_init_apu_flags(adev);
3592
3593         r = amdgpu_device_check_arguments(adev);
3594         if (r)
3595                 return r;
3596
3597         spin_lock_init(&adev->mmio_idx_lock);
3598         spin_lock_init(&adev->smc_idx_lock);
3599         spin_lock_init(&adev->pcie_idx_lock);
3600         spin_lock_init(&adev->uvd_ctx_idx_lock);
3601         spin_lock_init(&adev->didt_idx_lock);
3602         spin_lock_init(&adev->gc_cac_idx_lock);
3603         spin_lock_init(&adev->se_cac_idx_lock);
3604         spin_lock_init(&adev->audio_endpt_idx_lock);
3605         spin_lock_init(&adev->mm_stats.lock);
3606
3607         INIT_LIST_HEAD(&adev->shadow_list);
3608         mutex_init(&adev->shadow_list_lock);
3609
3610         INIT_LIST_HEAD(&adev->reset_list);
3611
3612         INIT_LIST_HEAD(&adev->ras_list);
3613
3614         INIT_DELAYED_WORK(&adev->delayed_init_work,
3615                           amdgpu_device_delayed_init_work_handler);
3616         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3617                           amdgpu_device_delay_enable_gfx_off);
3618
3619         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3620
3621         adev->gfx.gfx_off_req_count = 1;
3622         adev->gfx.gfx_off_residency = 0;
3623         adev->gfx.gfx_off_entrycount = 0;
3624         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3625
3626         atomic_set(&adev->throttling_logging_enabled, 1);
3627         /*
3628          * If throttling continues, logging will be performed every minute
3629          * to avoid log flooding. "-1" is subtracted since the thermal
3630          * throttling interrupt comes every second. Thus, the total logging
3631          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3632          * for throttling interrupt) = 60 seconds.
3633          */
3634         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3635         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3636
3637         /* Registers mapping */
3638         /* TODO: block userspace mapping of io register */
3639         if (adev->asic_type >= CHIP_BONAIRE) {
3640                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3641                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3642         } else {
3643                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3644                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3645         }
3646
3647         for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3648                 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3649
3650         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3651         if (!adev->rmmio)
3652                 return -ENOMEM;
3654         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3655         DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3656
3657         amdgpu_device_get_pcie_info(adev);
3658
3659         if (amdgpu_mcbp)
3660                 DRM_INFO("MCBP is enabled\n");
3661
3662         /*
3663          * The reset domain needs to be present early, before any XGMI hive is
3664          * discovered and initialized, to make the reset sem and in_gpu_reset
3665          * flag usable early during init and before the first call to RREG32.
3666          */
3667         adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3668         if (!adev->reset_domain)
3669                 return -ENOMEM;
3670
3671         /* detect hw virtualization here */
3672         amdgpu_detect_virtualization(adev);
3673
3674         r = amdgpu_device_get_job_timeout_settings(adev);
3675         if (r) {
3676                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3677                 return r;
3678         }
3679
3680         /* early init functions */
3681         r = amdgpu_device_ip_early_init(adev);
3682         if (r)
3683                 return r;
3684
3685         /* Enable TMZ based on IP_VERSION */
3686         amdgpu_gmc_tmz_set(adev);
3687
3688         amdgpu_gmc_noretry_set(adev);
3689         /* Need to get xgmi info early to decide the reset behavior */
3690         if (adev->gmc.xgmi.supported) {
3691                 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3692                 if (r)
3693                         return r;
3694         }
3695
3696         /* enable PCIE atomic ops */
3697         if (amdgpu_sriov_vf(adev))
3698                 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3699                         adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3700                         (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3701         else
3702                 adev->have_atomics_support =
3703                         !pci_enable_atomic_ops_to_root(adev->pdev,
3704                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3705                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3706         if (!adev->have_atomics_support)
3707                 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
3708
3709         /* doorbell bar mapping and doorbell index init */
3710         amdgpu_device_doorbell_init(adev);
3711
3712         if (amdgpu_emu_mode == 1) {
3713                 /* post the asic on emulation mode */
3714                 emu_soc_asic_init(adev);
3715                 goto fence_driver_init;
3716         }
3717
3718         amdgpu_reset_init(adev);
3719
3720         /* detect if we are with an SRIOV vbios */
3721         amdgpu_device_detect_sriov_bios(adev);
3722
3723         /* check if we need to reset the asic
3724          *  E.g., driver was not cleanly unloaded previously, etc.
3725          */
3726         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3727                 if (adev->gmc.xgmi.num_physical_nodes) {
3728                         dev_info(adev->dev, "Pending hive reset.\n");
3729                         adev->gmc.xgmi.pending_reset = true;
3730                         /* Only need to init the blocks necessary for the SMU to handle the reset */
3731                         for (i = 0; i < adev->num_ip_blocks; i++) {
3732                                 if (!adev->ip_blocks[i].status.valid)
3733                                         continue;
3734                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3735                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3736                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3737                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3738                                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3739                                                 adev->ip_blocks[i].version->funcs->name);
3740                                         adev->ip_blocks[i].status.hw = true;
3741                                 }
3742                         }
3743                 } else {
3744                         r = amdgpu_asic_reset(adev);
3745                         if (r) {
3746                                 dev_err(adev->dev, "asic reset on init failed\n");
3747                                 goto failed;
3748                         }
3749                 }
3750         }
3751
3752         pci_enable_pcie_error_reporting(adev->pdev);
3753
3754         /* Post card if necessary */
3755         if (amdgpu_device_need_post(adev)) {
3756                 if (!adev->bios) {
3757                         dev_err(adev->dev, "no vBIOS found\n");
3758                         r = -EINVAL;
3759                         goto failed;
3760                 }
3761                 DRM_INFO("GPU posting now...\n");
3762                 r = amdgpu_device_asic_init(adev);
3763                 if (r) {
3764                         dev_err(adev->dev, "gpu post error!\n");
3765                         goto failed;
3766                 }
3767         }
3768
3769         if (adev->is_atom_fw) {
3770                 /* Initialize clocks */
3771                 r = amdgpu_atomfirmware_get_clock_info(adev);
3772                 if (r) {
3773                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3774                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3775                         goto failed;
3776                 }
3777         } else {
3778                 /* Initialize clocks */
3779                 r = amdgpu_atombios_get_clock_info(adev);
3780                 if (r) {
3781                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3782                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3783                         goto failed;
3784                 }
3785                 /* init i2c buses */
3786                 if (!amdgpu_device_has_dc_support(adev))
3787                         amdgpu_atombios_i2c_init(adev);
3788         }
3789
3790 fence_driver_init:
3791         /* Fence driver */
3792         r = amdgpu_fence_driver_sw_init(adev);
3793         if (r) {
3794                 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3795                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3796                 goto failed;
3797         }
3798
3799         /* init the mode config */
3800         drm_mode_config_init(adev_to_drm(adev));
3801
3802         r = amdgpu_device_ip_init(adev);
3803         if (r) {
3804                 /* failed in exclusive mode due to timeout */
3805                 if (amdgpu_sriov_vf(adev) &&
3806                     !amdgpu_sriov_runtime(adev) &&
3807                     amdgpu_virt_mmio_blocked(adev) &&
3808                     !amdgpu_virt_wait_reset(adev)) {
3809                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3810                         /* Don't send request since VF is inactive. */
3811                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3812                         adev->virt.ops = NULL;
3813                         r = -EAGAIN;
3814                         goto release_ras_con;
3815                 }
3816                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3817                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3818                 goto release_ras_con;
3819         }
3820
3821         amdgpu_fence_driver_hw_init(adev);
3822
3823         dev_info(adev->dev,
3824                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3825                         adev->gfx.config.max_shader_engines,
3826                         adev->gfx.config.max_sh_per_se,
3827                         adev->gfx.config.max_cu_per_sh,
3828                         adev->gfx.cu_info.number);
3829
3830         adev->accel_working = true;
3831
3832         amdgpu_vm_check_compute_bug(adev);
3833
3834         /* Initialize the buffer migration limit. */
3835         if (amdgpu_moverate >= 0)
3836                 max_MBps = amdgpu_moverate;
3837         else
3838                 max_MBps = 8; /* Allow 8 MB/s. */
3839         /* Get a log2 for easy divisions. */
3840         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3841
3842         r = amdgpu_pm_sysfs_init(adev);
3843         if (r) {
3844                 adev->pm_sysfs_en = false;
3845                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3846         } else
3847                 adev->pm_sysfs_en = true;
3848
3849         r = amdgpu_ucode_sysfs_init(adev);
3850         if (r) {
3851                 adev->ucode_sysfs_en = false;
3852                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3853         } else
3854                 adev->ucode_sysfs_en = true;
3855
3856         r = amdgpu_psp_sysfs_init(adev);
3857         if (r) {
3858                 adev->psp_sysfs_en = false;
3859                 if (!amdgpu_sriov_vf(adev))
3860                         DRM_ERROR("Creating psp sysfs failed\n");
3861         } else
3862                 adev->psp_sysfs_en = true;
3863
3864         /*
3865          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3866          * Otherwise the mgpu fan boost feature will be skipped because the
3867          * gpu instance count would be too low.
3868          */
3869         amdgpu_register_gpu_instance(adev);
3870
3871         /* enable clockgating and similar features after ib tests since some
3872          * blocks require explicit gating rather than handling it automatically.
3873          */
3874         if (!adev->gmc.xgmi.pending_reset) {
3875                 r = amdgpu_device_ip_late_init(adev);
3876                 if (r) {
3877                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3878                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3879                         goto release_ras_con;
3880                 }
3881                 /* must succeed. */
3882                 amdgpu_ras_resume(adev);
3883                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3884                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3885         }
3886
3887         if (amdgpu_sriov_vf(adev))
3888                 flush_delayed_work(&adev->delayed_init_work);
3889
3890         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3891         if (r)
3892                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3893
3894         if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3895                 r = amdgpu_pmu_init(adev);
3896                 if (r)
3897                         dev_err(adev->dev, "amdgpu_pmu_init failed\n");
        }
3898
3899         /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3900         if (amdgpu_device_cache_pci_state(adev->pdev))
3901                 pci_restore_state(pdev);
3902
3903         /* if we have more than one VGA card, disable the amdgpu VGA resources */
3904         /* this will fail for cards that aren't VGA class devices, just
3905          * ignore it */
3906         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3907                 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3908
3909         if (amdgpu_device_supports_px(ddev)) {
3910                 px = true;
3911                 vga_switcheroo_register_client(adev->pdev,
3912                                                &amdgpu_switcheroo_ops, px);
3913                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3914         }
3915
3916         if (adev->gmc.xgmi.pending_reset)
3917                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3918                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3919
3920         amdgpu_device_check_iommu_direct_map(adev);
3921
3922         return 0;
3923
3924 release_ras_con:
3925         amdgpu_release_ras_context(adev);
3926
3927 failed:
3928         amdgpu_vf_error_trans_all(adev);
3929
3930         return r;
3931 }
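
/*
 * Illustrative call site, loosely following the KMS load path in
 * amdgpu_kms.c: the PCI driver_data flags are handed straight through.
 *
 *      r = amdgpu_device_init(adev, flags);
 *      if (r)
 *              dev_err(adev->dev, "Fatal error during GPU init\n");
 */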
3932
3933 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3934 {
3935
3936         /* Clear all CPU mappings pointing to this device */
3937         unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3938
3939         /* Unmap all mapped bars - Doorbell, registers and VRAM */
3940         amdgpu_device_doorbell_fini(adev);
3941
3942         iounmap(adev->rmmio);
3943         adev->rmmio = NULL;
3944         if (adev->mman.aper_base_kaddr)
3945                 iounmap(adev->mman.aper_base_kaddr);
3946         adev->mman.aper_base_kaddr = NULL;
3947
3948         /* Memory manager related */
3949         if (!adev->gmc.xgmi.connected_to_cpu) {
3950                 arch_phys_wc_del(adev->gmc.vram_mtrr);
3951                 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3952         }
3953 }
3954
3955 /**
3956  * amdgpu_device_fini_hw - tear down the driver
3957  *
3958  * @adev: amdgpu_device pointer
3959  *
3960  * Tear down the driver info (all asics).
3961  * Called at driver shutdown.
3962  */
3963 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3964 {
3965         dev_info(adev->dev, "amdgpu: finishing device.\n");
3966         flush_delayed_work(&adev->delayed_init_work);
3967         adev->shutdown = true;
3968
3969         /* make sure the IB test has finished before entering exclusive mode
3970          * to avoid preemption on the IB test
3971          */
3972         if (amdgpu_sriov_vf(adev)) {
3973                 amdgpu_virt_request_full_gpu(adev, false);
3974                 amdgpu_virt_fini_data_exchange(adev);
3975         }
3976
3977         /* disable all interrupts */
3978         amdgpu_irq_disable_all(adev);
3979         if (adev->mode_info.mode_config_initialized) {
3980                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3981                         drm_helper_force_disable_all(adev_to_drm(adev));
3982                 else
3983                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3984         }
3985         amdgpu_fence_driver_hw_fini(adev);
3986
3987         if (adev->mman.initialized)
3988                 drain_workqueue(adev->mman.bdev.wq);
3989
3990         if (adev->pm_sysfs_en)
3991                 amdgpu_pm_sysfs_fini(adev);
3992         if (adev->ucode_sysfs_en)
3993                 amdgpu_ucode_sysfs_fini(adev);
3994         if (adev->psp_sysfs_en)
3995                 amdgpu_psp_sysfs_fini(adev);
3996         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3997
3998         /* RAS features must be disabled before hw fini */
3999         amdgpu_ras_pre_fini(adev);
4000
4001         amdgpu_device_ip_fini_early(adev);
4002
4003         amdgpu_irq_fini_hw(adev);
4004
4005         if (adev->mman.initialized)
4006                 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4007
4008         amdgpu_gart_dummy_page_fini(adev);
4009
4010         amdgpu_device_unmap_mmio(adev);
4011
4012 }
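
/*
 * Illustrative teardown ordering, hedged: the PCI remove path runs
 * amdgpu_device_fini_hw() while the device may still be reachable, and
 * amdgpu_device_fini_sw() below runs later, once the last drm reference
 * is dropped and only software state is left to free.
 */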
4013
4014 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4015 {
4016         int idx;
4017
4018         amdgpu_fence_driver_sw_fini(adev);
4019         amdgpu_device_ip_fini(adev);
4020         release_firmware(adev->firmware.gpu_info_fw);
4021         adev->firmware.gpu_info_fw = NULL;
4022         adev->accel_working = false;
4023         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4024
4025         amdgpu_reset_fini(adev);
4026
4027         /* free i2c buses */
4028         if (!amdgpu_device_has_dc_support(adev))
4029                 amdgpu_i2c_fini(adev);
4030
4031         if (amdgpu_emu_mode != 1)
4032                 amdgpu_atombios_fini(adev);
4033
4034         kfree(adev->bios);
4035         adev->bios = NULL;
4036         if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4037                 vga_switcheroo_unregister_client(adev->pdev);
4038                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4039         }
4040         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4041                 vga_client_unregister(adev->pdev);
4042
4043         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4044
4045                 iounmap(adev->rmmio);
4046                 adev->rmmio = NULL;
4047                 amdgpu_device_doorbell_fini(adev);
4048                 drm_dev_exit(idx);
4049         }
4050
4051         if (IS_ENABLED(CONFIG_PERF_EVENTS))
4052                 amdgpu_pmu_fini(adev);
4053         if (adev->mman.discovery_bin)
4054                 amdgpu_discovery_fini(adev);
4055
4056         amdgpu_reset_put_reset_domain(adev->reset_domain);
4057         adev->reset_domain = NULL;
4058
4059         kfree(adev->pci_state);
4060
4061 }
4062
4063 /**
4064  * amdgpu_device_evict_resources - evict device resources
4065  * @adev: amdgpu device object
4066  *
4067  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4068  * of the vram memory type. Mainly used for evicting device resources
4069  * at suspend time.
4070  *
4071  */
4072 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4073 {
4074         int ret;
4075
4076         /* No need to evict vram on APUs for suspend to ram or s2idle */
4077         if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4078                 return 0;
4079
4080         ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4081         if (ret)
4082                 DRM_WARN("evicting device resources failed\n");
4083         return ret;
4084 }
4085
4086 /*
4087  * Suspend & resume.
4088  */
4089 /**
4090  * amdgpu_device_suspend - initiate device suspend
4091  *
4092  * @dev: drm dev pointer
4093  * @fbcon: notify the fbdev of suspend
4094  *
4095  * Puts the hw in the suspend state (all asics).
4096  * Returns 0 for success or an error on failure.
4097  * Called at driver suspend.
4098  */
4099 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4100 {
4101         struct amdgpu_device *adev = drm_to_adev(dev);
4102         int r = 0;
4103
4104         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4105                 return 0;
4106
4107         adev->in_suspend = true;
4108
4109         if (amdgpu_sriov_vf(adev)) {
4110                 amdgpu_virt_fini_data_exchange(adev);
4111                 r = amdgpu_virt_request_full_gpu(adev, false);
4112                 if (r)
4113                         return r;
4114         }
4115
4116         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4117                 DRM_WARN("smart shift update failed\n");
4118
4119         drm_kms_helper_poll_disable(dev);
4120
4121         if (fbcon)
4122                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4123
4124         cancel_delayed_work_sync(&adev->delayed_init_work);
4125
4126         amdgpu_ras_suspend(adev);
4127
4128         amdgpu_device_ip_suspend_phase1(adev);
4129
4130         if (!adev->in_s0ix)
4131                 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4132
4133         r = amdgpu_device_evict_resources(adev);
4134         if (r)
4135                 return r;
4136
4137         amdgpu_fence_driver_hw_fini(adev);
4138
4139         amdgpu_device_ip_suspend_phase2(adev);
4140
4141         if (amdgpu_sriov_vf(adev))
4142                 amdgpu_virt_release_full_gpu(adev, false);
4143
4144         return 0;
4145 }
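
/*
 * Illustrative sketch, not the upstream amdgpu_drv.c code: a dev_pm_ops
 * style .suspend hook would wrap the helper above roughly like this
 * (amdgpu_example_pmops_suspend() is hypothetical).
 */
static inline int amdgpu_example_pmops_suspend(struct device *dev)
{
        struct drm_device *drm_dev = dev_get_drvdata(dev);

        return amdgpu_device_suspend(drm_dev, true);
}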
4146
4147 /**
4148  * amdgpu_device_resume - initiate device resume
4149  *
4150  * @dev: drm dev pointer
4151  * @fbcon: notify the fbdev of resume
4152  *
4153  * Bring the hw back to operating state (all asics).
4154  * Returns 0 for success or an error on failure.
4155  * Called at driver resume.
4156  */
4157 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4158 {
4159         struct amdgpu_device *adev = drm_to_adev(dev);
4160         int r = 0;
4161
4162         if (amdgpu_sriov_vf(adev)) {
4163                 r = amdgpu_virt_request_full_gpu(adev, true);
4164                 if (r)
4165                         return r;
4166         }
4167
4168         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4169                 return 0;
4170
4171         if (adev->in_s0ix)
4172                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4173
4174         /* post card */
4175         if (amdgpu_device_need_post(adev)) {
4176                 r = amdgpu_device_asic_init(adev);
4177                 if (r)
4178                         dev_err(adev->dev, "amdgpu asic init failed\n");
4179         }
4180
4181         r = amdgpu_device_ip_resume(adev);
4182
4183         /* no matter what r is, we always need to properly release the full GPU */
4184         if (amdgpu_sriov_vf(adev)) {
4185                 amdgpu_virt_init_data_exchange(adev);
4186                 amdgpu_virt_release_full_gpu(adev, true);
4187         }
4188
4189         if (r) {
4190                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4191                 return r;
4192         }
4193         amdgpu_fence_driver_hw_init(adev);
4194
4195         r = amdgpu_device_ip_late_init(adev);
4196         if (r)
4197                 return r;
4198
4199         queue_delayed_work(system_wq, &adev->delayed_init_work,
4200                            msecs_to_jiffies(AMDGPU_RESUME_MS));
4201
4202         if (!adev->in_s0ix) {
4203                 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4204                 if (r)
4205                         return r;
4206         }
4207
4208         /* Make sure IB tests are flushed */
4209         if (amdgpu_sriov_vf(adev))
4210                 amdgpu_irq_gpu_reset_resume_helper(adev);
4211         flush_delayed_work(&adev->delayed_init_work);
4212
4213         if (adev->in_s0ix) {
4214                 /* re-enable gfxoff after IP resume; it was disabled for IP
4215                  * resume in amdgpu_device_ip_resume_phase2().
4216                  */
4217                 amdgpu_gfx_off_ctrl(adev, true);
4218                 DRM_DEBUG("will enable gfxoff for the mission mode\n");
4219         }
4220         if (fbcon)
4221                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4222
4223         drm_kms_helper_poll_enable(dev);
4224
4225         amdgpu_ras_resume(adev);
4226
4227         if (adev->mode_info.num_crtc) {
4228                 /*
4229                  * Most of the connector probing functions try to acquire runtime pm
4230                  * refs to ensure that the GPU is powered on when connector polling is
4231                  * performed. Since we're calling this from a runtime PM callback,
4232                  * trying to acquire rpm refs will cause us to deadlock.
4233                  *
4234                  * Since we're guaranteed to be holding the rpm lock, it's safe to
4235                  * temporarily disable the rpm helpers so this doesn't deadlock us.
4236                  */
4237 #ifdef CONFIG_PM
4238                 dev->dev->power.disable_depth++;
4239 #endif
4240                 if (!adev->dc_enabled)
4241                         drm_helper_hpd_irq_event(dev);
4242                 else
4243                         drm_kms_helper_hotplug_event(dev);
4244 #ifdef CONFIG_PM
4245                 dev->dev->power.disable_depth--;
4246 #endif
4247         }
4248         adev->in_suspend = false;
4249
4250         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4251                 DRM_WARN("smart shift update failed\n");
4252
4253         return 0;
4254 }
4255
4256 /**
4257  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4258  *
4259  * @adev: amdgpu_device pointer
4260  *
4261  * The list of all the hardware IPs that make up the asic is walked and
4262  * the check_soft_reset callbacks are run.  check_soft_reset determines
4263  * if the asic is still hung or not.
4264  * Returns true if any of the IPs are still in a hung state, false if not.
4265  */
4266 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4267 {
4268         int i;
4269         bool asic_hang = false;
4270
4271         if (amdgpu_sriov_vf(adev))
4272                 return true;
4273
4274         if (amdgpu_asic_need_full_reset(adev))
4275                 return true;
4276
4277         for (i = 0; i < adev->num_ip_blocks; i++) {
4278                 if (!adev->ip_blocks[i].status.valid)
4279                         continue;
4280                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4281                         adev->ip_blocks[i].status.hang =
4282                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4283                 if (adev->ip_blocks[i].status.hang) {
4284                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4285                         asic_hang = true;
4286                 }
4287         }
4288         return asic_hang;
4289 }
4290
4291 /**
4292  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4293  *
4294  * @adev: amdgpu_device pointer
4295  *
4296  * The list of all the hardware IPs that make up the asic is walked and the
4297  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4298  * handles any IP specific hardware or software state changes that are
4299  * necessary for a soft reset to succeed.
4300  * Returns 0 on success, negative error code on failure.
4301  */
4302 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4303 {
4304         int i, r = 0;
4305
4306         for (i = 0; i < adev->num_ip_blocks; i++) {
4307                 if (!adev->ip_blocks[i].status.valid)
4308                         continue;
4309                 if (adev->ip_blocks[i].status.hang &&
4310                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4311                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4312                         if (r)
4313                                 return r;
4314                 }
4315         }
4316
4317         return 0;
4318 }
4319
4320 /**
4321  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4322  *
4323  * @adev: amdgpu_device pointer
4324  *
4325  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4326  * reset is necessary to recover.
4327  * Returns true if a full asic reset is required, false if not.
4328  */
4329 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4330 {
4331         int i;
4332
4333         if (amdgpu_asic_need_full_reset(adev))
4334                 return true;
4335
4336         for (i = 0; i < adev->num_ip_blocks; i++) {
4337                 if (!adev->ip_blocks[i].status.valid)
4338                         continue;
4339                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4340                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4341                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4342                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4343                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4344                         if (adev->ip_blocks[i].status.hang) {
4345                         dev_info(adev->dev, "Some blocks need a full reset!\n");
4346                                 return true;
4347                         }
4348                 }
4349         }
4350         return false;
4351 }
4352
4353 /**
4354  * amdgpu_device_ip_soft_reset - do a soft reset
4355  *
4356  * @adev: amdgpu_device pointer
4357  *
4358  * The list of all the hardware IPs that make up the asic is walked and the
4359  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4360  * IP specific hardware or software state changes that are necessary to soft
4361  * reset the IP.
4362  * Returns 0 on success, negative error code on failure.
4363  */
4364 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4365 {
4366         int i, r = 0;
4367
4368         for (i = 0; i < adev->num_ip_blocks; i++) {
4369                 if (!adev->ip_blocks[i].status.valid)
4370                         continue;
4371                 if (adev->ip_blocks[i].status.hang &&
4372                     adev->ip_blocks[i].version->funcs->soft_reset) {
4373                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4374                         if (r)
4375                                 return r;
4376                 }
4377         }
4378
4379         return 0;
4380 }
4381
4382 /**
4383  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4384  *
4385  * @adev: amdgpu_device pointer
4386  *
4387  * The list of all the hardware IPs that make up the asic is walked and the
4388  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4389  * handles any IP specific hardware or software state changes that are
4390  * necessary after the IP has been soft reset.
4391  * Returns 0 on success, negative error code on failure.
4392  */
4393 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4394 {
4395         int i, r = 0;
4396
4397         for (i = 0; i < adev->num_ip_blocks; i++) {
4398                 if (!adev->ip_blocks[i].status.valid)
4399                         continue;
4400                 if (adev->ip_blocks[i].status.hang &&
4401                     adev->ip_blocks[i].version->funcs->post_soft_reset) {
4402                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4403                         if (r)
4404                                 return r;
                     }
4405         }
4406
4407         return 0;
4408 }
4409
4410 /**
4411  * amdgpu_device_recover_vram - Recover some VRAM contents
4412  *
4413  * @adev: amdgpu_device pointer
4414  *
4415  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4416  * restore things like GPUVM page tables after a GPU reset where
4417  * the contents of VRAM might be lost.
4418  *
4419  * Returns:
4420  * 0 on success, negative error code on failure.
4421  */
4422 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4423 {
4424         struct dma_fence *fence = NULL, *next = NULL;
4425         struct amdgpu_bo *shadow;
4426         struct amdgpu_bo_vm *vmbo;
4427         long r = 1, tmo;
4428
4429         if (amdgpu_sriov_runtime(adev))
4430                 tmo = msecs_to_jiffies(8000);
4431         else
4432                 tmo = msecs_to_jiffies(100);
4433
4434         dev_info(adev->dev, "recover vram bo from shadow start\n");
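             /*
              * The loop below pipelines the restore copies: the copy for the
              * current shadow is issued first, then we wait on the fence of
              * the previous one, so one copy is always in flight while we
              * wait.
              */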
4435         mutex_lock(&adev->shadow_list_lock);
4436         list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4437                 shadow = &vmbo->bo;
4438                 /* No need to recover an evicted BO */
4439                 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4440                     shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4441                     shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4442                         continue;
4443
4444                 r = amdgpu_bo_restore_shadow(shadow, &next);
4445                 if (r)
4446                         break;
4447
4448                 if (fence) {
4449                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4450                         dma_fence_put(fence);
4451                         fence = next;
4452                         if (tmo == 0) {
4453                                 r = -ETIMEDOUT;
4454                                 break;
4455                         } else if (tmo < 0) {
4456                                 r = tmo;
4457                                 break;
4458                         }
4459                 } else {
4460                         fence = next;
4461                 }
4462         }
4463         mutex_unlock(&adev->shadow_list_lock);
4464
4465         if (fence)
4466                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4467         dma_fence_put(fence);
4468
4469         if (r < 0 || tmo <= 0) {
4470                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4471                 return -EIO;
4472         }
4473
4474         dev_info(adev->dev, "recover vram bo from shadow done\n");
4475         return 0;
4476 }
4477
4478
4479 /**
4480  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4481  *
4482  * @adev: amdgpu_device pointer
4483  * @from_hypervisor: whether the reset request came from the hypervisor
4484  *
4485  * Perform a VF FLR and reinitialize the ASIC.
4486  * Returns 0 on success, negative error code on failure.
4487  */
4488 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4489                                      bool from_hypervisor)
4490 {
4491         int r;
4492         struct amdgpu_hive_info *hive = NULL;
4493         int retry_limit = 0;
4494
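     /*
      * Failures matched by AMDGPU_RETRY_SRIOV_RESET() are treated as
      * transient: the whole FLR-and-reinit sequence below is retried
      * up to AMDGPU_MAX_RETRY_LIMIT times before giving up.
      */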
4495 retry:
4496         amdgpu_amdkfd_pre_reset(adev);
4497
4498         if (from_hypervisor)
4499                 r = amdgpu_virt_request_full_gpu(adev, true);
4500         else
4501                 r = amdgpu_virt_reset_gpu(adev);
4502         if (r)
4503                 return r;
4504
4505         /* Resume IP prior to SMC */
4506         r = amdgpu_device_ip_reinit_early_sriov(adev);
4507         if (r)
4508                 goto error;
4509
4510         amdgpu_virt_init_data_exchange(adev);
4511
4512         r = amdgpu_device_fw_loading(adev);
4513         if (r)
4514                 return r;
4515
4516         /* now we are okay to resume SMC/CP/SDMA */
4517         r = amdgpu_device_ip_reinit_late_sriov(adev);
4518         if (r)
4519                 goto error;
4520
4521         hive = amdgpu_get_xgmi_hive(adev);
4522         /* Update PSP FW topology after reset */
4523         if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4524                 r = amdgpu_xgmi_update_topology(hive, adev);
4525
4526         if (hive)
4527                 amdgpu_put_xgmi_hive(hive);
4528
4529         if (!r) {
4530                 amdgpu_irq_gpu_reset_resume_helper(adev);
4531                 r = amdgpu_ib_ring_tests(adev);
4532
4533                 amdgpu_amdkfd_post_reset(adev);
4534         }
4535
4536 error:
4537         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4538                 amdgpu_inc_vram_lost(adev);
4539                 r = amdgpu_device_recover_vram(adev);
4540         }
4541         amdgpu_virt_release_full_gpu(adev, true);
4542
4543         if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4544                 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4545                         retry_limit++;
4546                         goto retry;
4547                 } else
4548                         DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4549         }
4550
4551         return r;
4552 }
4553
4554 /**
4555  * amdgpu_device_has_job_running - check if any job is still running
4556  *
4557  * @adev: amdgpu_device pointer
4558  *
4559  * Check if any ring's scheduler still has a job in its pending list.
4560  */
4561 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4562 {
4563         int i;
4564         struct drm_sched_job *job;
4565
4566         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4567                 struct amdgpu_ring *ring = adev->rings[i];
4568
4569                 if (!ring || !ring->sched.thread)
4570                         continue;
4571
4572                 spin_lock(&ring->sched.job_list_lock);
4573                 job = list_first_entry_or_null(&ring->sched.pending_list,
4574                                                struct drm_sched_job, list);
4575                 spin_unlock(&ring->sched.job_list_lock);
4576                 if (job)
4577                         return true;
4578         }
4579         return false;
4580 }
4581
4582 /**
4583  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4584  *
4585  * @adev: amdgpu_device pointer
4586  *
4587  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4588  * a hung GPU.
4589  */
4590 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4591 {
4592
4593         if (amdgpu_gpu_recovery == 0)
4594                 goto disabled;
4595
4596         /* Skip soft reset check in fatal error mode */
4597         if (!amdgpu_ras_is_poison_mode_supported(adev))
4598                 return true;
4599
4600         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4601                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4602                 return false;
4603         }
4604
4605         if (amdgpu_sriov_vf(adev))
4606                 return true;
4607
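     /*
      * amdgpu_gpu_recovery == -1 (the module parameter's default) means
      * auto: recovery stays enabled except on the ASICs listed below.
      */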
4608         if (amdgpu_gpu_recovery == -1) {
4609                 switch (adev->asic_type) {
4610 #ifdef CONFIG_DRM_AMDGPU_SI
4611                 case CHIP_VERDE:
4612                 case CHIP_TAHITI:
4613                 case CHIP_PITCAIRN:
4614                 case CHIP_OLAND:
4615                 case CHIP_HAINAN:
4616 #endif
4617 #ifdef CONFIG_DRM_AMDGPU_CIK
4618                 case CHIP_KAVERI:
4619                 case CHIP_KABINI:
4620                 case CHIP_MULLINS:
4621 #endif
4622                 case CHIP_CARRIZO:
4623                 case CHIP_STONEY:
4624                 case CHIP_CYAN_SKILLFISH:
4625                         goto disabled;
4626                 default:
4627                         break;
4628                 }
4629         }
4630
4631         return true;
4632
4633 disabled:
4634         dev_info(adev->dev, "GPU recovery disabled.\n");
4635         return false;
4636 }
4637
4638 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4639 {
4640         u32 i;
4641         int ret = 0;
4642
4643         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4644
4645         dev_info(adev->dev, "GPU mode1 reset\n");
4646
4647         /* disable BM */
4648         pci_clear_master(adev->pdev);
4649
4650         amdgpu_device_cache_pci_state(adev->pdev);
4651
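     /*
      * Prefer the SMU-driven mode1 reset when the firmware supports it;
      * otherwise fall back to a PSP-driven reset.
      */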
4652         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4653                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4654                 ret = amdgpu_dpm_mode1_reset(adev);
4655         } else {
4656                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4657                 ret = psp_gpu_reset(adev);
4658         }
4659
4660         if (ret)
4661                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4662
4663         amdgpu_device_load_pci_state(adev->pdev);
4664
4665         /* wait for the asic to come out of reset; memsize reads back 0xffffffff until then */
4666         for (i = 0; i < adev->usec_timeout; i++) {
4667                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4668
4669                 if (memsize != 0xffffffff)
4670                         break;
4671                 udelay(1);
4672         }
4673
4674         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4675         return ret;
4676 }
4677
4678 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4679                                  struct amdgpu_reset_context *reset_context)
4680 {
4681         int i, r = 0;
4682         struct amdgpu_job *job = NULL;
4683         bool need_full_reset =
4684                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4685
4686         if (reset_context->reset_req_dev == adev)
4687                 job = reset_context->job;
4688
4689         if (amdgpu_sriov_vf(adev)) {
4690                 /* stop the data exchange thread */
4691                 amdgpu_virt_fini_data_exchange(adev);
4692         }
4693
4694         amdgpu_fence_driver_isr_toggle(adev, true);
4695
4696         /* block all schedulers and reset given job's ring */
4697         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4698                 struct amdgpu_ring *ring = adev->rings[i];
4699
4700                 if (!ring || !ring->sched.thread)
4701                         continue;
4702
4703                 /* Clear the job fences from the fence driver so the
4704                  * force_completion below only signals the NULL and vm flush fences. */
4705                 amdgpu_fence_driver_clear_job_fences(ring);
4706
4707                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4708                 amdgpu_fence_driver_force_completion(ring);
4709         }
4710
4711         amdgpu_fence_driver_isr_toggle(adev, false);
4712
4713         if (job && job->vm)
4714                 drm_sched_increase_karma(&job->base);
4715
4716         r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4717         /* If reset handler not implemented, continue; otherwise return */
4718         if (r == -ENOSYS)
4719                 r = 0;
4720         else
4721                 return r;
4722
4723         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4724         if (!amdgpu_sriov_vf(adev)) {
4725
4726                 if (!need_full_reset)
4727                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4728
4729                 if (!need_full_reset && amdgpu_gpu_recovery) {
4730                         amdgpu_device_ip_pre_soft_reset(adev);
4731                         r = amdgpu_device_ip_soft_reset(adev);
4732                         amdgpu_device_ip_post_soft_reset(adev);
4733                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4734                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4735                                 need_full_reset = true;
4736                         }
4737                 }
4738
4739                 if (need_full_reset)
4740                         r = amdgpu_device_ip_suspend(adev);
4741                 if (need_full_reset)
4742                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4743                 else
4744                         clear_bit(AMDGPU_NEED_FULL_RESET,
4745                                   &reset_context->flags);
4746         }
4747
4748         return r;
4749 }
4750
4751 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4752 {
4753         int i;
4754
4755         lockdep_assert_held(&adev->reset_domain->sem);
4756
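     /*
      * Snapshot the registers selected for reset-time dumping (the list
      * is typically configured through debugfs) so their values can be
      * reported in the devcoredump.
      */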
4757         for (i = 0; i < adev->num_regs; i++) {
4758                 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4759                 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4760                                              adev->reset_dump_reg_value[i]);
4761         }
4762
4763         return 0;
4764 }
4765
4766 #ifdef CONFIG_DEV_COREDUMP
4767 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4768                 size_t count, void *data, size_t datalen)
4769 {
4770         struct drm_printer p;
4771         struct amdgpu_device *adev = data;
4772         struct drm_print_iterator iter;
4773         int i;
4774
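     /*
      * The coredump is read in chunks: the print iterator renders the
      * whole report but only copies the [offset, offset + count) window
      * into @buffer.
      */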
4775         iter.data = buffer;
4776         iter.offset = 0;
4777         iter.start = offset;
4778         iter.remain = count;
4779
4780         p = drm_coredump_printer(&iter);
4781
4782         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4783         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4784         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4785         drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4786         if (adev->reset_task_info.pid)
4787                 drm_printf(&p, "process_name: %s PID: %d\n",
4788                            adev->reset_task_info.process_name,
4789                            adev->reset_task_info.pid);
4790
4791         if (adev->reset_vram_lost)
4792                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4793         if (adev->num_regs) {
4794                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4795
4796                 for (i = 0; i < adev->num_regs; i++)
4797                         drm_printf(&p, "0x%08x: 0x%08x\n",
4798                                    adev->reset_dump_reg_list[i],
4799                                    adev->reset_dump_reg_value[i]);
4800         }
4801
4802         return count - iter.remain;
4803 }
4804
4805 static void amdgpu_devcoredump_free(void *data)
4806 {
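             /* Nothing to free: @data is the amdgpu_device itself. */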
4807 }
4808
4809 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4810 {
4811         struct drm_device *dev = adev_to_drm(adev);
4812
4813         ktime_get_ts64(&adev->reset_time);
4814         dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4815                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4816 }
4817 #endif
4818
4819 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4820                          struct amdgpu_reset_context *reset_context)
4821 {
4822         struct amdgpu_device *tmp_adev = NULL;
4823         bool need_full_reset, skip_hw_reset, vram_lost = false;
4824         int r = 0;
4825         bool gpu_reset_for_dev_remove = false;
4826
4827         /* Try reset handler method first */
4828         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4829                                     reset_list);
4830         amdgpu_reset_reg_dumps(tmp_adev);
4831
4832         reset_context->reset_device_list = device_list_handle;
4833         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4834         /* If reset handler not implemented, continue; otherwise return */
4835         if (r == -ENOSYS)
4836                 r = 0;
4837         else
4838                 return r;
4839
4840         /* Reset handler not implemented, use the default method */
4841         need_full_reset =
4842                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4843         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4844
4845         gpu_reset_for_dev_remove =
4846                 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4847                         test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4848
4849         /*
4850          * ASIC reset has to be done on all XGMI hive nodes ASAP
4851          * to allow proper link negotiation in the FW (within 1 sec)
4852          */
4853         if (!skip_hw_reset && need_full_reset) {
4854                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4855                         /* For XGMI run all resets in parallel to speed up the process */
4856                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4857                                 tmp_adev->gmc.xgmi.pending_reset = false;
4858                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4859                                         r = -EALREADY;
4860                         } else {
4861                                 r = amdgpu_asic_reset(tmp_adev);
                             }
4862 
4863                         if (r) {
4864                                 dev_err(tmp_adev->dev, "ASIC reset failed with error %d for drm dev %s",
4865                                          r, adev_to_drm(tmp_adev)->unique);
4866                                 break;
4867                         }
4868                 }
4869
4870                 /* For XGMI, wait for all resets to complete before proceeding */
4871                 if (!r) {
4872                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4873                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4874                                         flush_work(&tmp_adev->xgmi_reset_work);
4875                                         r = tmp_adev->asic_reset_res;
4876                                         if (r)
4877                                                 break;
4878                                 }
4879                         }
4880                 }
4881         }
4882
4883         if (!r && amdgpu_ras_intr_triggered()) {
4884                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4885                         if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4886                             tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4887                                 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4888                 }
4889
4890                 amdgpu_ras_intr_cleared();
4891         }
4892
4893         /* Since the mode1 reset affects base ip blocks, the
4894          * phase1 ip blocks need to be resumed. Otherwise there
4895          * will be a BIOS signature error and the psp bootloader
4896          * can't load the kdb on the next amdgpu driver load.
4897          */
4898         if (gpu_reset_for_dev_remove) {
4899                 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4900                         amdgpu_device_ip_resume_phase1(tmp_adev);
4901
4902                 goto end;
4903         }
4904
4905         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4906                 if (need_full_reset) {
4907                         /* post card */
4908                         r = amdgpu_device_asic_init(tmp_adev);
4909                         if (r) {
4910                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4911                         } else {
4912                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4913                                 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4914                                 if (r)
4915                                         goto out;
4916
4917                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4918                                 if (r)
4919                                         goto out;
4920
4921                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4922 #ifdef CONFIG_DEV_COREDUMP
4923                                 tmp_adev->reset_vram_lost = vram_lost;
4924                                 memset(&tmp_adev->reset_task_info, 0,
4925                                                 sizeof(tmp_adev->reset_task_info));
4926                                 if (reset_context->job && reset_context->job->vm)
4927                                         tmp_adev->reset_task_info =
4928                                                 reset_context->job->vm->task_info;
4929                                 amdgpu_reset_capture_coredumpm(tmp_adev);
4930 #endif
4931                                 if (vram_lost) {
4932                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4933                                         amdgpu_inc_vram_lost(tmp_adev);
4934                                 }
4935
4936                                 r = amdgpu_device_fw_loading(tmp_adev);
4937                                 if (r)
4938                                         return r;
4939
4940                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4941                                 if (r)
4942                                         goto out;
4943
4944                                 if (vram_lost)
4945                                         amdgpu_device_fill_reset_magic(tmp_adev);
4946
4947                                 /*
4948                                  * Add this ASIC back as tracked now that the
4949                                  * reset has completed successfully.
4950                                  */
4951                                 amdgpu_register_gpu_instance(tmp_adev);
4952
4953                                 if (!reset_context->hive &&
4954                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4955                                         amdgpu_xgmi_add_device(tmp_adev);
4956
4957                                 r = amdgpu_device_ip_late_init(tmp_adev);
4958                                 if (r)
4959                                         goto out;
4960
4961                                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4962
4963                                 /*
4964                                  * The GPU enters a bad state once the number of
4965                                  * faulty pages reported by ECC reaches the
4966                                  * threshold, and RAS recovery is scheduled next.
4967                                  * Check here to abort recovery if the bad page
4968                                  * threshold has indeed been exceeded, and remind
4969                                  * the user to either retire this GPU or set a
4970                                  * larger bad_page_threshold value before probing
4971                                  * the driver again.
4972                                  */
4973                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4974                                         /* must succeed. */
4975                                         amdgpu_ras_resume(tmp_adev);
4976                                 } else {
4977                                         r = -EINVAL;
4978                                         goto out;
4979                                 }
4980
4981                                 /* Update PSP FW topology after reset */
4982                                 if (reset_context->hive &&
4983                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4984                                         r = amdgpu_xgmi_update_topology(
4985                                                 reset_context->hive, tmp_adev);
4986                         }
4987                 }
4988
4989 out:
4990                 if (!r) {
4991                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4992                         r = amdgpu_ib_ring_tests(tmp_adev);
4993                         if (r) {
4994                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4995                                 need_full_reset = true;
4996                                 r = -EAGAIN;
4997                                 goto end;
4998                         }
4999                 }
5000
5001                 if (!r)
5002                         r = amdgpu_device_recover_vram(tmp_adev);
5003                 else
5004                         tmp_adev->asic_reset_res = r;
5005         }
5006
5007 end:
5008         if (need_full_reset)
5009                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5010         else
5011                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5012         return r;
5013 }
5014
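     /*
      * Tell the SMU (MP1) which kind of reset is about to happen so the
      * firmware can quiesce accordingly; the state is cleared again in
      * amdgpu_device_unset_mp1_state() once recovery finishes.
      */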
5015 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5016 {
5017
5018         switch (amdgpu_asic_reset_method(adev)) {
5019         case AMD_RESET_METHOD_MODE1:
5020                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5021                 break;
5022         case AMD_RESET_METHOD_MODE2:
5023                 adev->mp1_state = PP_MP1_STATE_RESET;
5024                 break;
5025         default:
5026                 adev->mp1_state = PP_MP1_STATE_NONE;
5027                 break;
5028         }
5029 }
5030
5031 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5032 {
5033         amdgpu_vf_error_trans_all(adev);
5034         adev->mp1_state = PP_MP1_STATE_NONE;
5035 }
5036
5037 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5038 {
5039         struct pci_dev *p = NULL;
5040
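             /* Function 1 on the GPU's bus/slot is its HDMI/DP audio device. */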
5041         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5042                         adev->pdev->bus->number, 1);
5043         if (p) {
5044                 pm_runtime_enable(&(p->dev));
5045                 pm_runtime_resume(&(p->dev));
5046         }
5047 }
5048
5049 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5050 {
5051         enum amd_reset_method reset_method;
5052         struct pci_dev *p = NULL;
5053         u64 expires;
5054
5055         /*
5056          * For now, only BACO and mode1 reset are confirmed to
5057          * suffer from the audio issue if not properly suspended.
5058          */
5059         reset_method = amdgpu_asic_reset_method(adev);
5060         if ((reset_method != AMD_RESET_METHOD_BACO) &&
5061              (reset_method != AMD_RESET_METHOD_MODE1))
5062                 return -EINVAL;
5063
5064         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5065                         adev->pdev->bus->number, 1);
5066         if (!p)
5067                 return -ENODEV;
5068
5069         expires = pm_runtime_autosuspend_expiration(&(p->dev));
5070         if (!expires)
5071                 /*
5072                  * If we cannot get the audio device's autosuspend delay,
5073                  * use a fixed 4s interval. Since 3s is the audio
5074                  * controller's default autosuspend delay, the 4s used
5075                  * here is guaranteed to cover it.
5076                  */
5077                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5078
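     /*
      * Poll until the audio device reports itself suspended, bailing
      * out if the autosuspend deadline computed above passes first.
      */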
5079         while (!pm_runtime_status_suspended(&(p->dev))) {
5080                 if (!pm_runtime_suspend(&(p->dev)))
5081                         break;
5082
5083                 if (expires < ktime_get_mono_fast_ns()) {
5084                         dev_warn(adev->dev, "failed to suspend display audio\n");
5085                         /* TODO: abort the succeeding gpu reset? */
5086                         return -ETIMEDOUT;
5087                 }
5088         }
5089
5090         pm_runtime_disable(&(p->dev));
5091
5092         return 0;
5093 }
5094
5095 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5096 {
5097         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5098
5099 #if defined(CONFIG_DEBUG_FS)
5100         if (!amdgpu_sriov_vf(adev))
5101                 cancel_work(&adev->reset_work);
5102 #endif
5103
5104         if (adev->kfd.dev)
5105                 cancel_work(&adev->kfd.reset_work);
5106
5107         if (amdgpu_sriov_vf(adev))
5108                 cancel_work(&adev->virt.flr_work);
5109
5110         if (con && adev->ras_enabled)
5111                 cancel_work(&con->recovery_work);
5112
5113 }
5114
5115 /**
5116  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5117  *
5118  * @adev: amdgpu_device pointer
5119  * @job: the job that triggered the hang
5120  *
5121  * Attempt to reset the GPU if it has hung (all ASICs): do a soft or
5122  * full reset and reinitialize the ASIC.
5123  * Returns 0 for success or an error on failure.
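      *
      * A minimal sketch of a caller (the job-timeout path builds the
      * reset context roughly like this):
      *
      *   struct amdgpu_reset_context reset_context;
      *
      *   memset(&reset_context, 0, sizeof(reset_context));
      *   reset_context.method = AMD_RESET_METHOD_NONE;
      *   reset_context.reset_req_dev = adev;
      *   clear_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
      *
      *   r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);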
5124  */
5125
5126 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5127                               struct amdgpu_job *job,
5128                               struct amdgpu_reset_context *reset_context)
5129 {
5130         struct list_head device_list, *device_list_handle =  NULL;
5131         bool job_signaled = false;
5132         struct amdgpu_hive_info *hive = NULL;
5133         struct amdgpu_device *tmp_adev = NULL;
5134         int i, r = 0;
5135         bool need_emergency_restart = false;
5136         bool audio_suspended = false;
5137         bool gpu_reset_for_dev_remove = false;
5138
5139         gpu_reset_for_dev_remove =
5140                         test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5141                                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5142
5143         /*
5144          * Special case: RAS triggered and full reset isn't supported
5145          */
5146         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5147
5148         /*
5149          * Flush RAM to disk so that after reboot
5150          * the user can read the log and see why the system rebooted.
5151          */
5152         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5153                 DRM_WARN("Emergency reboot.");
5154
5155                 ksys_sync_helper();
5156                 emergency_restart();
5157         }
5158
5159         dev_info(adev->dev, "GPU %s begin!\n",
5160                 need_emergency_restart ? "jobs stop":"reset");
5161
5162         if (!amdgpu_sriov_vf(adev))
5163                 hive = amdgpu_get_xgmi_hive(adev);
5164         if (hive)
5165                 mutex_lock(&hive->hive_lock);
5166
5167         reset_context->job = job;
5168         reset_context->hive = hive;
5169         /*
5170          * Build the list of devices to reset.
5171          * If we are in XGMI hive mode, reorder the device list so
5172          * that adev is in the first position.
5173          */
5174         INIT_LIST_HEAD(&device_list);
5175         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5176                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5177                         list_add_tail(&tmp_adev->reset_list, &device_list);
5178                         if (gpu_reset_for_dev_remove && adev->shutdown)
5179                                 tmp_adev->shutdown = true;
5180                 }
5181                 if (!list_is_first(&adev->reset_list, &device_list))
5182                         list_rotate_to_front(&adev->reset_list, &device_list);
5183                 device_list_handle = &device_list;
5184         } else {
5185                 list_add_tail(&adev->reset_list, &device_list);
5186                 device_list_handle = &device_list;
5187         }
5188
5189         /* We need to lock reset domain only once both for XGMI and single device */
5190         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5191                                     reset_list);
5192         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5193
5194         /* block all schedulers and reset given job's ring */
5195         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5196
5197                 amdgpu_device_set_mp1_state(tmp_adev);
5198
5199                 /*
5200                  * Try to put the audio codec into a suspended state
5201                  * before the gpu reset starts.
5202                  *
5203                  * The power domain of the graphics device is shared
5204                  * with the AZ power domain, so without this we may
5205                  * change the audio hardware from behind the audio
5206                  * driver's back. That would trigger audio codec
5207                  * errors.
5208                  */
5209                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5210                         audio_suspended = true;
5211
5212                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5213
5214                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5215
5216                 if (!amdgpu_sriov_vf(tmp_adev))
5217                         amdgpu_amdkfd_pre_reset(tmp_adev);
5218
5219                 /*
5220                  * Mark the ASICs to be reset as untracked first,
5221                  * and add them back after the reset completes.
5222                  */
5223                 amdgpu_unregister_gpu_instance(tmp_adev);
5224
5225                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5226
5227                 /* disable ras on ALL IPs */
5228                 if (!need_emergency_restart &&
5229                       amdgpu_device_ip_need_full_reset(tmp_adev))
5230                         amdgpu_ras_suspend(tmp_adev);
5231
5232                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5233                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5234
5235                         if (!ring || !ring->sched.thread)
5236                                 continue;
5237
5238                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5239
5240                         if (need_emergency_restart)
5241                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5242                 }
5243                 atomic_inc(&tmp_adev->gpu_reset_counter);
5244         }
5245
5246         if (need_emergency_restart)
5247                 goto skip_sched_resume;
5248
5249         /*
5250          * Must check guilty signal here since after this point all old
5251          * HW fences are force signaled.
5252          *
5253          * job->base holds a reference to parent fence
5254          */
5255         if (job && dma_fence_is_signaled(&job->hw_fence)) {
5256                 job_signaled = true;
5257                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5258                 goto skip_hw_reset;
5259         }
5260
5261 retry:  /* Pre-ASIC reset for the remaining adevs in the XGMI hive. */
5262         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5263                 if (gpu_reset_for_dev_remove) {
5264                         /* Workaround for ASICs that need to disable the SMC first */
5265                         amdgpu_device_smu_fini_early(tmp_adev);
5266                 }
5267                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5268                 /* TODO: should we stop here? */
5269                 if (r) {
5270                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with error %d for drm dev %s",
5271                                   r, adev_to_drm(tmp_adev)->unique);
5272                         tmp_adev->asic_reset_res = r;
5273                 }
5274
5275                 /*
5276                  * Drop all pending non-scheduler resets. Scheduler resets
5277                  * were already dropped during drm_sched_stop.
5278                  */
5279                 amdgpu_device_stop_pending_resets(tmp_adev);
5280         }
5281
5282         /* Actual ASIC resets if needed.*/
5283         /* Host driver will handle XGMI hive reset for SRIOV */
5284         if (amdgpu_sriov_vf(adev)) {
5285                 r = amdgpu_device_reset_sriov(adev, !job);
5286                 if (r)
5287                         adev->asic_reset_res = r;
5288
5289                 /* Aldebaran supports ras in SRIOV, so we need to resume ras during reset */
5290                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5291                         amdgpu_ras_resume(adev);
5292         } else {
5293                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5294                 if (r == -EAGAIN)
5295                         goto retry;
5296
5297                 if (!r && gpu_reset_for_dev_remove)
5298                         goto recover_end;
5299         }
5300
5301 skip_hw_reset:
5302
5303         /* Post ASIC reset for all devs. */
5304         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5305
5306                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5307                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5308
5309                         if (!ring || !ring->sched.thread)
5310                                 continue;
5311
5312                         drm_sched_start(&ring->sched, true);
5313                 }
5314
5315                 if (tmp_adev->enable_mes && tmp_adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5316                         amdgpu_mes_self_test(tmp_adev);
5317
5318                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5319                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5320                 }
5321
5322                 if (tmp_adev->asic_reset_res)
5323                         r = tmp_adev->asic_reset_res;
5324
5325                 tmp_adev->asic_reset_res = 0;
5326
5327                 if (r) {
5328                         /* bad news, how do we tell this to userspace? */
5329                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5330                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5331                 } else {
5332                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5333                         if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5334                                 DRM_WARN("smart shift update failed\n");
5335                 }
5336         }
5337
5338 skip_sched_resume:
5339         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5340                 /* unlock kfd: SRIOV would do it separately */
5341                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5342                         amdgpu_amdkfd_post_reset(tmp_adev);
5343
5344                 /* kfd_post_reset will do nothing if the kfd device is not
5345                  * initialized, so bring up kfd here if it was not initialized before.
5346                  */
5347                 if (!tmp_adev->kfd.init_complete)
5348                         amdgpu_amdkfd_device_init(tmp_adev);
5349
5350                 if (audio_suspended)
5351                         amdgpu_device_resume_display_audio(tmp_adev);
5352
5353                 amdgpu_device_unset_mp1_state(tmp_adev);
5354
5355                 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5356         }
5357
5358 recover_end:
5359         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5360                                             reset_list);
5361         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5362
5363         if (hive) {
5364                 mutex_unlock(&hive->hive_lock);
5365                 amdgpu_put_xgmi_hive(hive);
5366         }
5367
5368         if (r)
5369                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5370
5371         atomic_set(&adev->reset_domain->reset_res, r);
5372         return r;
5373 }
5374
5375 /**
5376  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5377  *
5378  * @adev: amdgpu_device pointer
5379  *
5380  * Fetches and stores in the driver the PCIE capabilities (gen speed
5381  * and lanes) of the slot the device is in. Handles APUs and
5382  * virtualized environments where PCIE config space may not be available.
5383  */
5384 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5385 {
5386         struct pci_dev *pdev;
5387         enum pci_bus_speed speed_cap, platform_speed_cap;
5388         enum pcie_link_width platform_link_width;
5389
5390         if (amdgpu_pcie_gen_cap)
5391                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5392
5393         if (amdgpu_pcie_lane_cap)
5394                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5395
5396         /* covers APUs as well */
5397         if (pci_is_root_bus(adev->pdev->bus)) {
5398                 if (adev->pm.pcie_gen_mask == 0)
5399                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5400                 if (adev->pm.pcie_mlw_mask == 0)
5401                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5402                 return;
5403         }
5404
5405         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5406                 return;
5407
5408         pcie_bandwidth_available(adev->pdev, NULL,
5409                                  &platform_speed_cap, &platform_link_width);
5410
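     /*
      * The gen/width masks below are cumulative: a link that supports
      * GEN4, for example, also advertises GEN1-GEN3, so each cap ORs in
      * all of the lower speeds/widths as well.
      */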
5411         if (adev->pm.pcie_gen_mask == 0) {
5412                 /* asic caps */
5413                 pdev = adev->pdev;
5414                 speed_cap = pcie_get_speed_cap(pdev);
5415                 if (speed_cap == PCI_SPEED_UNKNOWN) {
5416                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5417                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5418                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5419                 } else {
5420                         if (speed_cap == PCIE_SPEED_32_0GT)
5421                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5422                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5423                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5424                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5425                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5426                         else if (speed_cap == PCIE_SPEED_16_0GT)
5427                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5428                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5429                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5430                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5431                         else if (speed_cap == PCIE_SPEED_8_0GT)
5432                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5433                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5434                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5435                         else if (speed_cap == PCIE_SPEED_5_0GT)
5436                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5437                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5438                         else
5439                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5440                 }
5441                 /* platform caps */
5442                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5443                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5444                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5445                 } else {
5446                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
5447                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5448                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5449                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5450                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5451                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5452                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5453                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5454                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5455                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5456                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5457                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5458                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5459                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5460                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5461                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5462                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5463                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5464                         else
5465                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5466
5467                 }
5468         }
5469         if (adev->pm.pcie_mlw_mask == 0) {
5470                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5471                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5472                 } else {
5473                         switch (platform_link_width) {
5474                         case PCIE_LNK_X32:
5475                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5476                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5477                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5478                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5479                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5480                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5481                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5482                                 break;
5483                         case PCIE_LNK_X16:
5484                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5485                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5486                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5487                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5488                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5489                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5490                                 break;
5491                         case PCIE_LNK_X12:
5492                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5493                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5494                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5495                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5496                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5497                                 break;
5498                         case PCIE_LNK_X8:
5499                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5500                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5501                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5502                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5503                                 break;
5504                         case PCIE_LNK_X4:
5505                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5506                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5507                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5508                                 break;
5509                         case PCIE_LNK_X2:
5510                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5511                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5512                                 break;
5513                         case PCIE_LNK_X1:
5514                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5515                                 break;
5516                         default:
5517                                 break;
5518                         }
5519                 }
5520         }
5521 }
5522
5523 /**
5524  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5525  *
5526  * @adev: amdgpu_device pointer
5527  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5528  *
5529  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5530  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5531  * @peer_adev.
5532  */
5533 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5534                                       struct amdgpu_device *peer_adev)
5535 {
5536 #ifdef CONFIG_HSA_AMD_P2P
5537         uint64_t address_mask = peer_adev->dev->dma_mask ?
5538                 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5539         resource_size_t aper_limit =
5540                 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5541         bool p2p_access =
5542                 !adev->gmc.xgmi.connected_to_cpu &&
5543                 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5544
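     /*
      * On top of P2P DMA support, peer access requires the whole VRAM
      * aperture to be CPU-visible (large BAR) and both the aperture
      * base and limit to fall within the peer's DMA mask.
      */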
5545         return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5546                 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5547                 !(adev->gmc.aper_base & address_mask ||
5548                   aper_limit & address_mask));
5549 #else
5550         return false;
5551 #endif
5552 }
5553
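/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Powers the chip down while keeping the PCIe bus active, e.g. on the
 * runtime-suspend path of BACO-capable boards. When RAS is enabled,
 * the doorbell interrupt is disabled first and re-enabled again in
 * amdgpu_device_baco_exit().
 */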
5554 int amdgpu_device_baco_enter(struct drm_device *dev)
5555 {
5556         struct amdgpu_device *adev = drm_to_adev(dev);
5557         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5558
5559         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5560                 return -ENOTSUPP;
5561
5562         if (ras && adev->ras_enabled &&
5563             adev->nbio.funcs->enable_doorbell_interrupt)
5564                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5565
5566         return amdgpu_dpm_baco_enter(adev);
5567 }
5568
5569 int amdgpu_device_baco_exit(struct drm_device *dev)
5570 {
5571         struct amdgpu_device *adev = drm_to_adev(dev);
5572         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5573         int ret = 0;
5574
5575         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5576                 return -ENOTSUPP;
5577
5578         ret = amdgpu_dpm_baco_exit(adev);
5579         if (ret)
5580                 return ret;
5581
5582         if (ras && adev->ras_enabled &&
5583             adev->nbio.funcs->enable_doorbell_interrupt)
5584                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5585
5586         if (amdgpu_passthrough(adev) &&
5587             adev->nbio.funcs->clear_doorbell_interrupt)
5588                 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5589
5590         return 0;
5591 }
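
/*
 * A usage sketch under the assumption of a simple power-cycle path: BACO
 * enter/exit are used as a matched pair around a low-power interval.
 * amdgpu_baco_cycle_example() is illustrative, not a real entry point.
 */
static int amdgpu_baco_cycle_example(struct drm_device *dev)
{
        int r = amdgpu_device_baco_enter(dev);

        if (r) /* e.g. -ENOTSUPP when the ASIC has no BACO support */
                return r;

        /* ... device rests in BACO (bus active, chip off) here ... */

        return amdgpu_device_baco_exit(dev);
}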
5592
5593 /**
5594  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5595  * @pdev: PCI device struct
5596  * @state: PCI channel state
5597  *
5598  * Description: Called when a PCI error is detected.
5599  *
5600  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5601  */
5602 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5603 {
5604         struct drm_device *dev = pci_get_drvdata(pdev);
5605         struct amdgpu_device *adev = drm_to_adev(dev);
5606         int i;
5607
5608         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5609
5610         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5611                 DRM_WARN("No support for XGMI hive yet...\n");
5612                 return PCI_ERS_RESULT_DISCONNECT;
5613         }
5614
5615         adev->pci_channel_state = state;
5616
5617         switch (state) {
5618         case pci_channel_io_normal:
5619                 return PCI_ERS_RESULT_CAN_RECOVER;
5620         /* Fatal error, prepare for slot reset */
5621         case pci_channel_io_frozen:
5622                 /*
5623                  * Locking adev->reset_domain->sem will prevent any external access
5624                  * to GPU during PCI error recovery
5625                  */
5626                 amdgpu_device_lock_reset_domain(adev->reset_domain);
5627                 amdgpu_device_set_mp1_state(adev);
5628
5629                 /*
5630                  * Block any work scheduling as we do for regular GPU reset
5631                  * for the duration of the recovery
5632                  */
5633                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5634                         struct amdgpu_ring *ring = adev->rings[i];
5635
5636                         if (!ring || !ring->sched.thread)
5637                                 continue;
5638
5639                         drm_sched_stop(&ring->sched, NULL);
5640                 }
5641                 atomic_inc(&adev->gpu_reset_counter);
5642                 return PCI_ERS_RESULT_NEED_RESET;
5643         case pci_channel_io_perm_failure:
5644                 /* Permanent error, prepare for device removal */
5645                 return PCI_ERS_RESULT_DISCONNECT;
5646         }
5647
5648         return PCI_ERS_RESULT_NEED_RESET;
5649 }
5650
5651 /**
5652  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5653  * @pdev: pointer to PCI device
5654  */
5655 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5656 {
5658         DRM_INFO("PCI error: mmio enabled callback!!\n");
5659
5660         /* TODO - dump whatever for debugging purposes */
5661
5662         /* This is called only if amdgpu_pci_error_detected returns
5663          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5664          * works, no need to reset slot.
5665          */
5666
5667         return PCI_ERS_RESULT_RECOVERED;
5668 }
5669
5670 /**
5671  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5672  * @pdev: PCI device struct
5673  *
5674  * Description: This routine is called by the pci error recovery
5675  * code after the PCI slot has been reset, just before we
5676  * should resume normal operations.
5677  */
5678 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5679 {
5680         struct drm_device *dev = pci_get_drvdata(pdev);
5681         struct amdgpu_device *adev = drm_to_adev(dev);
5682         int r, i;
5683         struct amdgpu_reset_context reset_context;
5684         u32 memsize;
5685         struct list_head device_list;
5686
5687         DRM_INFO("PCI error: slot reset callback!!\n");
5688
5689         memset(&reset_context, 0, sizeof(reset_context));
5690
5691         INIT_LIST_HEAD(&device_list);
5692         list_add_tail(&adev->reset_list, &device_list);
5693
5694         /* wait for asic to come out of reset */
5695         msleep(500);
5696
5697         /* Restore PCI config space */
5698         amdgpu_device_load_pci_state(pdev);
5699
5700         /* confirm ASIC came out of reset */
5701         for (i = 0; i < adev->usec_timeout; i++) {
5702                 memsize = amdgpu_asic_get_config_memsize(adev);
5703
5704                 if (memsize != 0xffffffff)
5705                         break;
5706                 udelay(1);
5707         }
5708         if (memsize == 0xffffffff) {
5709                 r = -ETIME;
5710                 goto out;
5711         }
5712
5713         reset_context.method = AMD_RESET_METHOD_NONE;
5714         reset_context.reset_req_dev = adev;
5715         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5716         set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); /* slot reset already reset the HW */
5717
5718         adev->no_hw_access = true;
5719         r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5720         adev->no_hw_access = false;
5721         if (r)
5722                 goto out;
5723
5724         r = amdgpu_do_asic_reset(&device_list, &reset_context);
5725
5726 out:
5727         if (!r) {
5728                 if (amdgpu_device_cache_pci_state(adev->pdev))
5729                         pci_restore_state(adev->pdev);
5730
5731                 DRM_INFO("PCIe error recovery succeeded\n");
5732         } else {
5733                 DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
5734                 amdgpu_device_unset_mp1_state(adev);
5735                 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5736         }
5737
5738         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5739 }
5740
5741 /**
5742  * amdgpu_pci_resume() - resume normal ops after PCI reset
5743  * @pdev: pointer to PCI device
5744  *
5745  * Called when the error recovery driver tells us that it's
5746  * OK to resume normal operation.
5747  */
5748 void amdgpu_pci_resume(struct pci_dev *pdev)
5749 {
5750         struct drm_device *dev = pci_get_drvdata(pdev);
5751         struct amdgpu_device *adev = drm_to_adev(dev);
5752         int i;
5753
5755         DRM_INFO("PCI error: resume callback!!\n");
5756
5757         /* Only continue execution for the case of pci_channel_io_frozen */
5758         if (adev->pci_channel_state != pci_channel_io_frozen)
5759                 return;
5760
5761         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5762                 struct amdgpu_ring *ring = adev->rings[i];
5763
5764                 if (!ring || !ring->sched.thread)
5765                         continue;
5766
5767                 drm_sched_start(&ring->sched, true);
5768         }
5769
5770         amdgpu_device_unset_mp1_state(adev);
5771         amdgpu_device_unlock_reset_domain(adev->reset_domain);
5772 }
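
/*
 * The four callbacks above are handed to the PCI core through a
 * struct pci_error_handlers table; amdgpu wires this up in amdgpu_drv.c
 * along these lines (sketch; the table name here is illustrative):
 */
static const struct pci_error_handlers amdgpu_pci_err_handler_example = {
        .error_detected = amdgpu_pci_error_detected,
        .mmio_enabled   = amdgpu_pci_mmio_enabled,
        .slot_reset     = amdgpu_pci_slot_reset,
        .resume         = amdgpu_pci_resume,
};
/* the table is then assigned to the driver's struct pci_driver::err_handler */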
5773
5774 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5775 {
5776         struct drm_device *dev = pci_get_drvdata(pdev);
5777         struct amdgpu_device *adev = drm_to_adev(dev);
5778         int r;
5779
5780         r = pci_save_state(pdev);
5781         if (!r) {
5782                 kfree(adev->pci_state);
5783
5784                 adev->pci_state = pci_store_saved_state(pdev);
5785
5786                 if (!adev->pci_state) {
5787                         DRM_ERROR("Failed to store PCI saved state\n");
5788                         return false;
5789                 }
5790         } else {
5791                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5792                 return false;
5793         }
5794
5795         return true;
5796 }
5797
5798 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5799 {
5800         struct drm_device *dev = pci_get_drvdata(pdev);
5801         struct amdgpu_device *adev = drm_to_adev(dev);
5802         int r;
5803
5804         if (!adev->pci_state)
5805                 return false;
5806
5807         r = pci_load_saved_state(pdev, adev->pci_state);
5808
5809         if (!r) {
5810                 pci_restore_state(pdev);
5811         } else {
5812                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5813                 return false;
5814         }
5815
5816         return true;
5817 }
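
/*
 * A sketch of how the cache/load pair above is intended to be used
 * around a reset; amdgpu_pci_state_roundtrip_example() is hypothetical.
 */
static bool amdgpu_pci_state_roundtrip_example(struct amdgpu_device *adev)
{
        /* snapshot config space while the device is still healthy */
        if (!amdgpu_device_cache_pci_state(adev->pdev))
                return false;

        /* ... reset happens here and config space is lost ... */

        /* replay the snapshot once the device is back on the bus */
        return amdgpu_device_load_pci_state(adev->pdev);
}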
5818
5819 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5820                 struct amdgpu_ring *ring)
5821 {
5822 #ifdef CONFIG_X86_64
5823         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5824                 return;
5825 #endif
5826         if (adev->gmc.xgmi.connected_to_cpu)
5827                 return;
5828
5829         if (ring && ring->funcs->emit_hdp_flush)
5830                 amdgpu_ring_emit_hdp_flush(ring);
5831         else
5832                 amdgpu_asic_flush_hdp(adev, ring);
5833 }
5834
5835 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5836                 struct amdgpu_ring *ring)
5837 {
5838 #ifdef CONFIG_X86_64
5839         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5840                 return;
5841 #endif
5842         if (adev->gmc.xgmi.connected_to_cpu)
5843                 return;
5844
5845         amdgpu_asic_invalidate_hdp(adev, ring);
5846 }
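
/*
 * A minimal sketch of the intended pattern, assuming a CPU write through
 * the VRAM BAR: flush HDP after CPU writes the GPU will read, and
 * invalidate HDP before CPU reads data the GPU has written. The helper
 * below is illustrative only.
 */
static void amdgpu_hdp_cpu_write_example(struct amdgpu_device *adev,
                                         void __iomem *vram_ptr, u32 value)
{
        writel(value, vram_ptr);             /* CPU write through the BAR */
        amdgpu_device_flush_hdp(adev, NULL); /* make it visible to the GPU */
}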
5847
5848 int amdgpu_in_reset(struct amdgpu_device *adev)
5849 {
5850         return atomic_read(&adev->reset_domain->in_gpu_reset);
5851 }
5852
5853 /**
5854  * amdgpu_device_halt() - bring hardware to some kind of halt state
5855  *
5856  * @adev: amdgpu_device pointer
5857  *
5858  * Bring the hardware to some kind of halt state so that no one can touch it
5859  * any more. It helps to maintain the error context when an error occurs.
5860  * Compared to a simple hang, the system will stay stable at least for SSH
5861  * access. Then it should be trivial to inspect the hardware state and
5862  * see what's going on. Implemented as follows:
5863  *
5864  * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
5865  *    clears all CPU mappings to the device, disallows remappings through page faults
5866  * 2. amdgpu_irq_disable_all() disables all interrupts
5867  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5868  * 4. set adev->no_hw_access to avoid potential crashes after step 5
5869  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5870  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5871  *    flush any in-flight DMA operations
5872  */
5873 void amdgpu_device_halt(struct amdgpu_device *adev)
5874 {
5875         struct pci_dev *pdev = adev->pdev;
5876         struct drm_device *ddev = adev_to_drm(adev);
5877
5878         drm_dev_unplug(ddev);
5879
5880         amdgpu_irq_disable_all(adev);
5881
5882         amdgpu_fence_driver_hw_fini(adev);
5883
5884         adev->no_hw_access = true;
5885
5886         amdgpu_device_unmap_mmio(adev);
5887
5888         pci_disable_device(pdev);
5889         pci_wait_for_pending_transaction(pdev);
5890 }
5891
5892 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5893                                 u32 reg)
5894 {
5895         unsigned long flags, address, data;
5896         u32 r;
5897
5898         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5899         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5900
5901         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5902         WREG32(address, reg * 4);
5903         (void)RREG32(address); /* posting read to make sure the index write landed */
5904         r = RREG32(data);
5905         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5906         return r;
5907 }
5908
5909 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5910                                 u32 reg, u32 v)
5911 {
5912         unsigned long flags, address, data;
5913
5914         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5915         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5916
5917         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5918         WREG32(address, reg * 4);
5919         (void)RREG32(address); /* posting read to make sure the index write landed */
5920         WREG32(data, v);
5921         (void)RREG32(data);
5922         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5923 }
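
/*
 * The two helpers above implement a classic index/data register pair. A
 * read-modify-write of a PCIe port register would look like the sketch
 * below; the helper name and set_mask parameter are hypothetical.
 */
static void amdgpu_pcie_port_rmw_example(struct amdgpu_device *adev,
                                         u32 reg, u32 set_mask)
{
        u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

        amdgpu_device_pcie_port_wreg(adev, reg, v | set_mask);
}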
5924
5925 /**
5926  * amdgpu_device_switch_gang - switch to a new gang
5927  * @adev: amdgpu_device pointer
5928  * @gang: the gang to switch to
5929  *
5930  * Try to switch to a new gang.
5931  * Returns: NULL if we switched to the new gang, or a reference to the
5932  * current gang leader, which must signal before the switch can be retried.
5933  */
5934 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
5935                                             struct dma_fence *gang)
5936 {
5937         struct dma_fence *old = NULL;
5938
5939         do {
5940                 dma_fence_put(old);
5941                 rcu_read_lock();
5942                 old = dma_fence_get_rcu_safe(&adev->gang_submit);
5943                 rcu_read_unlock();
5944
5945                 if (old == gang)
5946                         break;
5947
5948                 if (!dma_fence_is_signaled(old))
5949                         return old;
5950
5951         } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
5952                          old, gang) != old);
5953
5954         dma_fence_put(old);
5955         return NULL;
5956 }
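
/*
 * A caller-side sketch, assuming a blocking submitter: install the new
 * gang leader and wait out the old gang if one is still running. The
 * retry loop and blocking wait are illustrative assumptions; the real
 * submission path treats the returned fence as a scheduler dependency.
 */
static int amdgpu_gang_switch_example(struct amdgpu_device *adev,
                                      struct dma_fence *leader)
{
        struct dma_fence *old;

        while ((old = amdgpu_device_switch_gang(adev, leader))) {
                long r = dma_fence_wait(old, true);

                dma_fence_put(old);
                if (r < 0)
                        return r;
        }
        /* ... push the gang's jobs to their rings here ... */
        return 0;
}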
5957
5958 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
5959 {
5960         switch (adev->asic_type) {
5961 #ifdef CONFIG_DRM_AMDGPU_SI
5962         case CHIP_HAINAN:
5963 #endif
5964         case CHIP_TOPAZ:
5965                 /* chips with no display hardware */
5966                 return false;
5967 #ifdef CONFIG_DRM_AMDGPU_SI
5968         case CHIP_TAHITI:
5969         case CHIP_PITCAIRN:
5970         case CHIP_VERDE:
5971         case CHIP_OLAND:
5972 #endif
5973 #ifdef CONFIG_DRM_AMDGPU_CIK
5974         case CHIP_BONAIRE:
5975         case CHIP_HAWAII:
5976         case CHIP_KAVERI:
5977         case CHIP_KABINI:
5978         case CHIP_MULLINS:
5979 #endif
5980         case CHIP_TONGA:
5981         case CHIP_FIJI:
5982         case CHIP_POLARIS10:
5983         case CHIP_POLARIS11:
5984         case CHIP_POLARIS12:
5985         case CHIP_VEGAM:
5986         case CHIP_CARRIZO:
5987         case CHIP_STONEY:
5988                 /* chips with display hardware */
5989                 return true;
5990         default:
5991                 /* IP discovery */
5992                 if (!adev->ip_versions[DCE_HWIP][0] ||
5993                     (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
5994                         return false;
5995                 return true;
5996         }
5997 }
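
/*
 * A hypothetical caller gating display setup on the check above (sketch
 * only; amdgpu_display_setup_example() is not a real driver entry point):
 */
static int amdgpu_display_setup_example(struct amdgpu_device *adev)
{
        if (!amdgpu_device_has_display_hardware(adev))
                return 0; /* headless ASIC or harvested DMU, nothing to do */

        /* ... register the display IP blocks here ... */
        return 0;
}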