drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
85
86 #define AMDGPU_RESUME_MS                2000
87
88 const char *amdgpu_asic_name[] = {
89         "TAHITI",
90         "PITCAIRN",
91         "VERDE",
92         "OLAND",
93         "HAINAN",
94         "BONAIRE",
95         "KAVERI",
96         "KABINI",
97         "HAWAII",
98         "MULLINS",
99         "TOPAZ",
100         "TONGA",
101         "FIJI",
102         "CARRIZO",
103         "STONEY",
104         "POLARIS10",
105         "POLARIS11",
106         "POLARIS12",
107         "VEGAM",
108         "VEGA10",
109         "VEGA12",
110         "VEGA20",
111         "RAVEN",
112         "ARCTURUS",
113         "RENOIR",
114         "NAVI10",
115         "NAVI14",
116         "NAVI12",
117         "SIENNA_CICHLID",
118         "NAVY_FLOUNDER",
119         "LAST",
120 };
121
122 /**
123  * DOC: pcie_replay_count
124  *
125  * The amdgpu driver provides a sysfs API for reporting the total number
126  * of PCIe replays (NAKs).
127  * The file pcie_replay_count is used for this and returns the total
128  * number of replays as a sum of the NAKs generated and the NAKs received.
129  */
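/*
 * A minimal usage sketch for the sysfs attribute above, assuming the GPU is
 * exposed as card0 (adjust the card index as needed); the same pattern applies
 * to the product_name, product_number and serial_number attributes below:
 *
 *	$ cat /sys/class/drm/card0/device/pcie_replay_count
 *	0
 */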
130
131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
132                 struct device_attribute *attr, char *buf)
133 {
134         struct drm_device *ddev = dev_get_drvdata(dev);
135         struct amdgpu_device *adev = drm_to_adev(ddev);
136         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
137
138         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
139 }
140
141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
142                 amdgpu_device_get_pcie_replay_count, NULL);
143
144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
145
146 /**
147  * DOC: product_name
148  *
149  * The amdgpu driver provides a sysfs API for reporting the product name
150  * for the device.
151  * The file product_name is used for this and returns the product name
152  * as returned from the FRU.
153  * NOTE: This is only available for certain server cards
154  */
155
156 static ssize_t amdgpu_device_get_product_name(struct device *dev,
157                 struct device_attribute *attr, char *buf)
158 {
159         struct drm_device *ddev = dev_get_drvdata(dev);
160         struct amdgpu_device *adev = drm_to_adev(ddev);
161
162         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
163 }
164
165 static DEVICE_ATTR(product_name, S_IRUGO,
166                 amdgpu_device_get_product_name, NULL);
167
168 /**
169  * DOC: product_number
170  *
171  * The amdgpu driver provides a sysfs API for reporting the part number
172  * for the device.
173  * The file product_number is used for this and returns the part number
174  * as returned from the FRU.
175  * NOTE: This is only available for certain server cards
176  */
177
178 static ssize_t amdgpu_device_get_product_number(struct device *dev,
179                 struct device_attribute *attr, char *buf)
180 {
181         struct drm_device *ddev = dev_get_drvdata(dev);
182         struct amdgpu_device *adev = drm_to_adev(ddev);
183
184         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
185 }
186
187 static DEVICE_ATTR(product_number, S_IRUGO,
188                 amdgpu_device_get_product_number, NULL);
189
190 /**
191  * DOC: serial_number
192  *
193  * The amdgpu driver provides a sysfs API for reporting the serial number
194  * for the device.
195  * The file serial_number is used for this and returns the serial number
196  * as returned from the FRU.
197  * NOTE: This is only available for certain server cards
198  */
199
200 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
201                 struct device_attribute *attr, char *buf)
202 {
203         struct drm_device *ddev = dev_get_drvdata(dev);
204         struct amdgpu_device *adev = drm_to_adev(ddev);
205
206         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
207 }
208
209 static DEVICE_ATTR(serial_number, S_IRUGO,
210                 amdgpu_device_get_serial_number, NULL);
211
212 /**
213  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
214  *
215  * @dev: drm_device pointer
216  *
217  * Returns true if the device is a dGPU with HG/PX power control,
218  * otherwise returns false.
219  */
220 bool amdgpu_device_supports_boco(struct drm_device *dev)
221 {
222         struct amdgpu_device *adev = drm_to_adev(dev);
223
224         if (adev->flags & AMD_IS_PX)
225                 return true;
226         return false;
227 }
228
229 /**
230  * amdgpu_device_supports_baco - Does the device support BACO
231  *
232  * @dev: drm_device pointer
233  *
234  * Returns true if the device supports BACO,
235  * otherwise returns false.
236  */
237 bool amdgpu_device_supports_baco(struct drm_device *dev)
238 {
239         struct amdgpu_device *adev = drm_to_adev(dev);
240
241         return amdgpu_asic_supports_baco(adev);
242 }
243
244 /**
245  * VRAM access helper functions.
246  *
247  * amdgpu_device_vram_access - read/write a buffer in vram
248  *
249  * @adev: amdgpu_device pointer
250  * @pos: offset of the buffer in vram
251  * @buf: virtual address of the buffer in system memory
252  * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
253  * @write: true - write to vram, otherwise - read from vram
254  */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256                                uint32_t *buf, size_t size, bool write)
257 {
258         unsigned long flags;
259         uint32_t hi = ~0;
260         uint64_t last;
261
262
263 #ifdef CONFIG_64BIT
264         last = min(pos + size, adev->gmc.visible_vram_size);
265         if (last > pos) {
266                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267                 size_t count = last - pos;
268
269                 if (write) {
270                         memcpy_toio(addr, buf, count);
271                         mb();
272                         amdgpu_asic_flush_hdp(adev, NULL);
273                 } else {
274                         amdgpu_asic_invalidate_hdp(adev, NULL);
275                         mb();
276                         memcpy_fromio(buf, addr, count);
277                 }
278
279                 if (count == size)
280                         return;
281
282                 pos += count;
283                 buf += count / 4;
284                 size -= count;
285         }
286 #endif
287
288         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289         for (last = pos + size; pos < last; pos += 4) {
290                 uint32_t tmp = pos >> 31;
291
292                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293                 if (tmp != hi) {
294                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295                         hi = tmp;
296                 }
297                 if (write)
298                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
299                 else
300                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
301         }
302         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
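/*
 * A minimal usage sketch for the helper above, assuming a valid @adev; the
 * offset and size are illustrative only:
 *
 *	uint32_t data[4] = { 0 };
 *
 *	// read 16 bytes starting at VRAM offset 0 into system memory
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 *	// write the same 16 bytes back to VRAM
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), true);
 */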
304
305 /*
306  * MMIO register access helper functions.
307  */
308 /**
309  * amdgpu_mm_rreg - read a memory mapped IO register
310  *
311  * @adev: amdgpu_device pointer
312  * @reg: dword aligned register offset
313  * @acc_flags: access flags which require special behavior
314  *
315  * Returns the 32 bit value from the offset specified.
316  */
317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
318                         uint32_t acc_flags)
319 {
320         uint32_t ret;
321
322         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
323             down_read_trylock(&adev->reset_sem)) {
324                 ret = amdgpu_kiq_rreg(adev, reg);
325                 up_read(&adev->reset_sem);
326                 return ret;
327         }
328
329         if ((reg * 4) < adev->rmmio_size)
330                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
331         else {
332                 unsigned long flags;
333
334                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
335                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
336                 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
337                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
338         }
339
340         trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
341         return ret;
342 }
343
344 /*
345  * MMIO register read helper, byte-sized access
346  * @offset: byte offset from MMIO start
347  *
348  */
349
350 /**
351  * amdgpu_mm_rreg8 - read a memory mapped IO register
352  *
353  * @adev: amdgpu_device pointer
354  * @offset: byte aligned register offset
355  *
356  * Returns the 8 bit value from the offset specified.
357  */
358 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
359         if (offset < adev->rmmio_size)
360                 return (readb(adev->rmmio + offset));
361         BUG();
362 }
363
364 /*
365  * MMIO register write helper, byte-sized access
366  * @offset: byte offset from MMIO start
367  * @value: the value to be written to the register
368  *
369  */
370 /**
371  * amdgpu_mm_wreg8 - write to a memory mapped IO register
372  *
373  * @adev: amdgpu_device pointer
374  * @offset: byte aligned register offset
375  * @value: 8 bit value to write
376  *
377  * Writes the value specified to the offset specified.
378  */
379 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
380         if (offset < adev->rmmio_size)
381                 writeb(value, adev->rmmio + offset);
382         else
383                 BUG();
384 }
385
386 static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
387                                        uint32_t reg, uint32_t v,
388                                        uint32_t acc_flags)
389 {
390         trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
391
392         if ((reg * 4) < adev->rmmio_size)
393                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
394         else {
395                 unsigned long flags;
396
397                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
398                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
399                 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
400                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
401         }
402 }
403
404 /**
405  * amdgpu_mm_wreg - write to a memory mapped IO register
406  *
407  * @adev: amdgpu_device pointer
408  * @reg: dword aligned register offset
409  * @v: 32 bit value to write to the register
410  * @acc_flags: access flags which require special behavior
411  *
412  * Writes the value specified to the offset specified.
413  */
414 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
415                     uint32_t acc_flags)
416 {
417         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
418             down_read_trylock(&adev->reset_sem)) {
419                 amdgpu_kiq_wreg(adev, reg, v);
420                 up_read(&adev->reset_sem);
421                 return;
422         }
423
424         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
425 }
426
427 /*
428  * amdgpu_mm_wreg_mmio_rlc - write a register either through MMIO or through the RLC path if it is within the RLC access range
429  *
430  * This function is invoked only for debugfs register access.
431  */
432 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
433                     uint32_t acc_flags)
434 {
435         if (amdgpu_sriov_fullaccess(adev) &&
436                 adev->gfx.rlc.funcs &&
437                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
438
439                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
440                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
441         }
442
443         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
444 }
445
446 /**
447  * amdgpu_io_rreg - read an IO register
448  *
449  * @adev: amdgpu_device pointer
450  * @reg: dword aligned register offset
451  *
452  * Returns the 32 bit value from the offset specified.
453  */
454 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
455 {
456         if ((reg * 4) < adev->rio_mem_size)
457                 return ioread32(adev->rio_mem + (reg * 4));
458         else {
459                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
460                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
461         }
462 }
463
464 /**
465  * amdgpu_io_wreg - write to an IO register
466  *
467  * @adev: amdgpu_device pointer
468  * @reg: dword aligned register offset
469  * @v: 32 bit value to write to the register
470  *
471  * Writes the value specified to the offset specified.
472  */
473 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
474 {
475         if ((reg * 4) < adev->rio_mem_size)
476                 iowrite32(v, adev->rio_mem + (reg * 4));
477         else {
478                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
479                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
480         }
481 }
482
483 /**
484  * amdgpu_mm_rdoorbell - read a doorbell dword
485  *
486  * @adev: amdgpu_device pointer
487  * @index: doorbell index
488  *
489  * Returns the value in the doorbell aperture at the
490  * requested doorbell index (CIK).
491  */
492 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
493 {
494         if (index < adev->doorbell.num_doorbells) {
495                 return readl(adev->doorbell.ptr + index);
496         } else {
497                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
498                 return 0;
499         }
500 }
501
502 /**
503  * amdgpu_mm_wdoorbell - write a doorbell dword
504  *
505  * @adev: amdgpu_device pointer
506  * @index: doorbell index
507  * @v: value to write
508  *
509  * Writes @v to the doorbell aperture at the
510  * requested doorbell index (CIK).
511  */
512 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
513 {
514         if (index < adev->doorbell.num_doorbells) {
515                 writel(v, adev->doorbell.ptr + index);
516         } else {
517                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
518         }
519 }
520
521 /**
522  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
523  *
524  * @adev: amdgpu_device pointer
525  * @index: doorbell index
526  *
527  * Returns the value in the doorbell aperture at the
528  * requested doorbell index (VEGA10+).
529  */
530 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
531 {
532         if (index < adev->doorbell.num_doorbells) {
533                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
534         } else {
535                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
536                 return 0;
537         }
538 }
539
540 /**
541  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
542  *
543  * @adev: amdgpu_device pointer
544  * @index: doorbell index
545  * @v: value to write
546  *
547  * Writes @v to the doorbell aperture at the
548  * requested doorbell index (VEGA10+).
549  */
550 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
551 {
552         if (index < adev->doorbell.num_doorbells) {
553                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
554         } else {
555                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
556         }
557 }
558
559 /**
560  * amdgpu_invalid_rreg - dummy reg read function
561  *
562  * @adev: amdgpu device pointer
563  * @reg: offset of register
564  *
565  * Dummy register read function.  Used for register blocks
566  * that certain asics don't have (all asics).
567  * Returns the value in the register.
568  */
569 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
570 {
571         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
572         BUG();
573         return 0;
574 }
575
576 /**
577  * amdgpu_invalid_wreg - dummy reg write function
578  *
579  * @adev: amdgpu device pointer
580  * @reg: offset of register
581  * @v: value to write to the register
582  *
583  * Dummy register write function.  Used for register blocks
584  * that certain asics don't have (all asics).
585  */
586 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
587 {
588         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
589                   reg, v);
590         BUG();
591 }
592
593 /**
594  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
595  *
596  * @adev: amdgpu device pointer
597  * @reg: offset of register
598  *
599  * Dummy register read function.  Used for register blocks
600  * that certain asics don't have (all asics).
601  * Returns the value in the register.
602  */
603 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
604 {
605         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
606         BUG();
607         return 0;
608 }
609
610 /**
611  * amdgpu_invalid_wreg64 - dummy reg write function
612  *
613  * @adev: amdgpu device pointer
614  * @reg: offset of register
615  * @v: value to write to the register
616  *
617  * Dummy register write function.  Used for register blocks
618  * that certain asics don't have (all asics).
619  */
620 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
621 {
622         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
623                   reg, v);
624         BUG();
625 }
626
627 /**
628  * amdgpu_block_invalid_rreg - dummy reg read function
629  *
630  * @adev: amdgpu device pointer
631  * @block: offset of instance
632  * @reg: offset of register
633  *
634  * Dummy register read function.  Used for register blocks
635  * that certain asics don't have (all asics).
636  * Returns the value in the register.
637  */
638 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
639                                           uint32_t block, uint32_t reg)
640 {
641         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
642                   reg, block);
643         BUG();
644         return 0;
645 }
646
647 /**
648  * amdgpu_block_invalid_wreg - dummy reg write function
649  *
650  * @adev: amdgpu device pointer
651  * @block: offset of instance
652  * @reg: offset of register
653  * @v: value to write to the register
654  *
655  * Dummy register write function.  Used for register blocks
656  * that certain asics don't have (all asics).
657  */
658 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
659                                       uint32_t block,
660                                       uint32_t reg, uint32_t v)
661 {
662         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
663                   reg, block, v);
664         BUG();
665 }
666
667 /**
668  * amdgpu_device_asic_init - Wrapper for atom asic_init
669  *
670  * @adev: amdgpu_device pointer
671  *
672  * Does any asic specific work and then calls atom asic init.
673  */
674 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
675 {
676         amdgpu_asic_pre_asic_init(adev);
677
678         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
679 }
680
681 /**
682  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
683  *
684  * @adev: amdgpu device pointer
685  *
686  * Allocates a scratch page of VRAM for use by various things in the
687  * driver.
688  */
689 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
690 {
691         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
692                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
693                                        &adev->vram_scratch.robj,
694                                        &adev->vram_scratch.gpu_addr,
695                                        (void **)&adev->vram_scratch.ptr);
696 }
697
698 /**
699  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
700  *
701  * @adev: amdgpu device pointer
702  *
703  * Frees the VRAM scratch page.
704  */
705 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
706 {
707         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
708 }
709
710 /**
711  * amdgpu_device_program_register_sequence - program an array of registers.
712  *
713  * @adev: amdgpu_device pointer
714  * @registers: pointer to the register array
715  * @array_size: size of the register array
716  *
717  * Programs an array of registers with AND and OR masks.
718  * This is a helper for setting golden registers.
719  */
720 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
721                                              const u32 *registers,
722                                              const u32 array_size)
723 {
724         u32 tmp, reg, and_mask, or_mask;
725         int i;
726
727         if (array_size % 3)
728                 return;
729
730         for (i = 0; i < array_size; i += 3) {
731                 reg = registers[i + 0];
732                 and_mask = registers[i + 1];
733                 or_mask = registers[i + 2];
734
735                 if (and_mask == 0xffffffff) {
736                         tmp = or_mask;
737                 } else {
738                         tmp = RREG32(reg);
739                         tmp &= ~and_mask;
740                         if (adev->family >= AMDGPU_FAMILY_AI)
741                                 tmp |= (or_mask & and_mask);
742                         else
743                                 tmp |= or_mask;
744                 }
745                 WREG32(reg, tmp);
746         }
747 }
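/*
 * A sketch of how a golden register list for the helper above is laid out,
 * as {reg, and_mask, or_mask} triples; the offsets and masks here are made
 * up for illustration and are not real golden settings:
 *
 *	static const u32 example_golden_regs[] = {
 *		0x1234, 0xffffffff, 0x00000001,
 *		0x5678, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *						ARRAY_SIZE(example_golden_regs));
 *
 * The first triple overwrites the register (an and_mask of all ones means the
 * or_mask is written directly); the second read-modify-writes only the low
 * nibble.
 */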
748
749 /**
750  * amdgpu_device_pci_config_reset - reset the GPU
751  *
752  * @adev: amdgpu_device pointer
753  *
754  * Resets the GPU using the pci config reset sequence.
755  * Only applicable to asics prior to vega10.
756  */
757 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
758 {
759         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
760 }
761
762 /*
763  * GPU doorbell aperture helpers function.
764  */
765 /**
766  * amdgpu_device_doorbell_init - Init doorbell driver information.
767  *
768  * @adev: amdgpu_device pointer
769  *
770  * Init doorbell driver information (CIK)
771  * Returns 0 on success, error on failure.
772  */
773 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
774 {
775
776         /* No doorbell on SI hardware generation */
777         if (adev->asic_type < CHIP_BONAIRE) {
778                 adev->doorbell.base = 0;
779                 adev->doorbell.size = 0;
780                 adev->doorbell.num_doorbells = 0;
781                 adev->doorbell.ptr = NULL;
782                 return 0;
783         }
784
785         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
786                 return -EINVAL;
787
788         amdgpu_asic_init_doorbell_index(adev);
789
790         /* doorbell bar mapping */
791         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
792         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
793
794         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
795                                              adev->doorbell_index.max_assignment+1);
796         if (adev->doorbell.num_doorbells == 0)
797                 return -EINVAL;
798
799         /* For Vega, reserve and map two pages on the doorbell BAR since the SDMA
800          * paging queue doorbell uses the second page. The
801          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
802          * doorbells are in the first page. So with the paging queue enabled,
803          * max num_doorbells should be increased by one page (0x400 in dwords).
804          */
805         if (adev->asic_type >= CHIP_VEGA10)
806                 adev->doorbell.num_doorbells += 0x400;
807
808         adev->doorbell.ptr = ioremap(adev->doorbell.base,
809                                      adev->doorbell.num_doorbells *
810                                      sizeof(u32));
811         if (adev->doorbell.ptr == NULL)
812                 return -ENOMEM;
813
814         return 0;
815 }
816
817 /**
818  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
819  *
820  * @adev: amdgpu_device pointer
821  *
822  * Tear down doorbell driver information (CIK)
823  */
824 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
825 {
826         iounmap(adev->doorbell.ptr);
827         adev->doorbell.ptr = NULL;
828 }
829
830
831
832 /*
833  * amdgpu_device_wb_*()
834  * Writeback is the method by which the GPU updates special pages in memory
835  * with the status of certain GPU events (fences, ring pointers, etc.).
836  */
837
838 /**
839  * amdgpu_device_wb_fini - Disable Writeback and free memory
840  *
841  * @adev: amdgpu_device pointer
842  *
843  * Disables Writeback and frees the Writeback memory (all asics).
844  * Used at driver shutdown.
845  */
846 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
847 {
848         if (adev->wb.wb_obj) {
849                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
850                                       &adev->wb.gpu_addr,
851                                       (void **)&adev->wb.wb);
852                 adev->wb.wb_obj = NULL;
853         }
854 }
855
856 /**
857  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
858  *
859  * @adev: amdgpu_device pointer
860  *
861  * Initializes writeback and allocates writeback memory (all asics).
862  * Used at driver startup.
863  * Returns 0 on success or a negative error code on failure.
864  */
865 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
866 {
867         int r;
868
869         if (adev->wb.wb_obj == NULL) {
870                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
871                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
872                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
873                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
874                                             (void **)&adev->wb.wb);
875                 if (r) {
876                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
877                         return r;
878                 }
879
880                 adev->wb.num_wb = AMDGPU_MAX_WB;
881                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
882
883                 /* clear wb memory */
884                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
885         }
886
887         return 0;
888 }
889
890 /**
891  * amdgpu_device_wb_get - Allocate a wb entry
892  *
893  * @adev: amdgpu_device pointer
894  * @wb: wb index
895  *
896  * Allocate a wb slot for use by the driver (all asics).
897  * Returns 0 on success or -EINVAL on failure.
898  */
899 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
900 {
901         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
902
903         if (offset < adev->wb.num_wb) {
904                 __set_bit(offset, adev->wb.used);
905                 *wb = offset << 3; /* convert to dw offset */
906                 return 0;
907         } else {
908                 return -EINVAL;
909         }
910 }
911
912 /**
913  * amdgpu_device_wb_free - Free a wb entry
914  *
915  * @adev: amdgpu_device pointer
916  * @wb: wb index
917  *
918  * Free a wb slot allocated for use by the driver (all asics)
919  */
920 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
921 {
922         wb >>= 3;
923         if (wb < adev->wb.num_wb)
924                 __clear_bit(wb, adev->wb.used);
925 }
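/*
 * A sketch of the typical pairing of the two helpers above, assuming a valid
 * @adev and with error handling trimmed; the variable name is illustrative:
 *
 *	u32 wb_offset;
 *
 *	if (amdgpu_device_wb_get(adev, &wb_offset))
 *		return -EINVAL;
 *	// CPU view: adev->wb.wb[wb_offset]
 *	// GPU address: adev->wb.gpu_addr + wb_offset * 4
 *	amdgpu_device_wb_free(adev, wb_offset);
 */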
926
927 /**
928  * amdgpu_device_resize_fb_bar - try to resize FB BAR
929  *
930  * @adev: amdgpu_device pointer
931  *
932  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
933  * to fail, but if any of the BARs is not accessible after the resize we abort
934  * driver loading by returning -ENODEV.
935  */
936 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
937 {
938         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
939         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
940         struct pci_bus *root;
941         struct resource *res;
942         unsigned i;
943         u16 cmd;
944         int r;
945
946         /* Bypass for VF */
947         if (amdgpu_sriov_vf(adev))
948                 return 0;
949
950         /* skip if the bios has already enabled large BAR */
951         if (adev->gmc.real_vram_size &&
952             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
953                 return 0;
954
955         /* Check if the root BUS has 64bit memory resources */
956         root = adev->pdev->bus;
957         while (root->parent)
958                 root = root->parent;
959
960         pci_bus_for_each_resource(root, res, i) {
961                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
962                     res->start > 0x100000000ull)
963                         break;
964         }
965
966         /* Trying to resize is pointless without a root hub window above 4GB */
967         if (!res)
968                 return 0;
969
970         /* Disable memory decoding while we change the BAR addresses and size */
971         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
972         pci_write_config_word(adev->pdev, PCI_COMMAND,
973                               cmd & ~PCI_COMMAND_MEMORY);
974
975         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
976         amdgpu_device_doorbell_fini(adev);
977         if (adev->asic_type >= CHIP_BONAIRE)
978                 pci_release_resource(adev->pdev, 2);
979
980         pci_release_resource(adev->pdev, 0);
981
982         r = pci_resize_resource(adev->pdev, 0, rbar_size);
983         if (r == -ENOSPC)
984                 DRM_INFO("Not enough PCI address space for a large BAR.");
985         else if (r && r != -ENOTSUPP)
986                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
987
988         pci_assign_unassigned_bus_resources(adev->pdev->bus);
989
990         /* When the doorbell or fb BAR isn't available we have no chance of
991          * using the device.
992          */
993         r = amdgpu_device_doorbell_init(adev);
994         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
995                 return -ENODEV;
996
997         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
998
999         return 0;
1000 }
1001
1002 /*
1003  * GPU helpers function.
1004  */
1005 /**
1006  * amdgpu_device_need_post - check if the hw need post or not
1007  *
1008  * @adev: amdgpu_device pointer
1009  *
1010  * Check if the asic has been initialized (all asics) at driver startup,
1011  * or if a post is needed because a hw reset was performed.
1012  * Returns true if a post is needed, false if not.
1013  */
1014 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1015 {
1016         uint32_t reg;
1017
1018         if (amdgpu_sriov_vf(adev))
1019                 return false;
1020
1021         if (amdgpu_passthrough(adev)) {
1022                 /* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
1023                  * reboot some old SMC firmware still needs the driver to do a vPost,
1024                  * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
1025                  * this flaw, so force a vPost for SMC versions below 22.15.
1026                  */
1027                 if (adev->asic_type == CHIP_FIJI) {
1028                         int err;
1029                         uint32_t fw_ver;
1030                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1031                         /* force vPost if an error occurred */
1032                         if (err)
1033                                 return true;
1034
1035                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1036                         if (fw_ver < 0x00160e00)
1037                                 return true;
1038                 }
1039         }
1040
1041         if (adev->has_hw_reset) {
1042                 adev->has_hw_reset = false;
1043                 return true;
1044         }
1045
1046         /* bios scratch used on CIK+ */
1047         if (adev->asic_type >= CHIP_BONAIRE)
1048                 return amdgpu_atombios_scratch_need_asic_init(adev);
1049
1050         /* check MEM_SIZE for older asics */
1051         reg = amdgpu_asic_get_config_memsize(adev);
1052
1053         if ((reg != 0) && (reg != 0xffffffff))
1054                 return false;
1055
1056         return true;
1057 }
1058
1059 /* if we get transitioned to only one device, take VGA back */
1060 /**
1061  * amdgpu_device_vga_set_decode - enable/disable vga decode
1062  *
1063  * @cookie: amdgpu_device pointer
1064  * @state: enable/disable vga decode
1065  *
1066  * Enable/disable vga decode (all asics).
1067  * Returns VGA resource flags.
1068  */
1069 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1070 {
1071         struct amdgpu_device *adev = cookie;
1072         amdgpu_asic_set_vga_state(adev, state);
1073         if (state)
1074                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1075                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1076         else
1077                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1078 }
1079
1080 /**
1081  * amdgpu_device_check_block_size - validate the vm block size
1082  *
1083  * @adev: amdgpu_device pointer
1084  *
1085  * Validates the vm block size specified via module parameter.
1086  * The vm block size defines the number of bits split between the page table
1087  * and the page directory; a page is 4KB, so we have a 12-bit offset, at
1088  * minimum 9 bits in the page table, and the remaining bits in the page directory.
1089  */
1090 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1091 {
1092         /* defines number of bits in page table versus page directory,
1093          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1094          * page table and the remaining bits are in the page directory */
1095         if (amdgpu_vm_block_size == -1)
1096                 return;
1097
1098         if (amdgpu_vm_block_size < 9) {
1099                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1100                          amdgpu_vm_block_size);
1101                 amdgpu_vm_block_size = -1;
1102         }
1103 }
1104
1105 /**
1106  * amdgpu_device_check_vm_size - validate the vm size
1107  *
1108  * @adev: amdgpu_device pointer
1109  *
1110  * Validates the vm size in GB specified via module parameter.
1111  * The VM size is the size of the GPU virtual memory space in GB.
1112  */
1113 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1114 {
1115         /* no need to check the default value */
1116         if (amdgpu_vm_size == -1)
1117                 return;
1118
1119         if (amdgpu_vm_size < 1) {
1120                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1121                          amdgpu_vm_size);
1122                 amdgpu_vm_size = -1;
1123         }
1124 }
1125
1126 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1127 {
1128         struct sysinfo si;
1129         bool is_os_64 = (sizeof(void *) == 8);
1130         uint64_t total_memory;
1131         uint64_t dram_size_seven_GB = 0x1B8000000;
1132         uint64_t dram_size_three_GB = 0xB8000000;
1133
1134         if (amdgpu_smu_memory_pool_size == 0)
1135                 return;
1136
1137         if (!is_os_64) {
1138                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1139                 goto def_value;
1140         }
1141         si_meminfo(&si);
1142         total_memory = (uint64_t)si.totalram * si.mem_unit;
1143
1144         if ((amdgpu_smu_memory_pool_size == 1) ||
1145                 (amdgpu_smu_memory_pool_size == 2)) {
1146                 if (total_memory < dram_size_three_GB)
1147                         goto def_value1;
1148         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1149                 (amdgpu_smu_memory_pool_size == 8)) {
1150                 if (total_memory < dram_size_seven_GB)
1151                         goto def_value1;
1152         } else {
1153                 DRM_WARN("Smu memory pool size not supported\n");
1154                 goto def_value;
1155         }
1156         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1157
1158         return;
1159
1160 def_value1:
1161         DRM_WARN("Not enough system memory\n");
1162 def_value:
1163         adev->pm.smu_prv_buffer_size = 0;
1164 }
1165
1166 /**
1167  * amdgpu_device_check_arguments - validate module params
1168  *
1169  * @adev: amdgpu_device pointer
1170  *
1171  * Validates certain module parameters and updates
1172  * the associated values used by the driver (all asics).
1173  */
1174 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1175 {
1176         if (amdgpu_sched_jobs < 4) {
1177                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1178                          amdgpu_sched_jobs);
1179                 amdgpu_sched_jobs = 4;
1180         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1181                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1182                          amdgpu_sched_jobs);
1183                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1184         }
1185
1186         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1187                 /* gart size must be greater or equal to 32M */
1188                 dev_warn(adev->dev, "gart size (%d) too small\n",
1189                          amdgpu_gart_size);
1190                 amdgpu_gart_size = -1;
1191         }
1192
1193         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1194                 /* gtt size must be greater or equal to 32M */
1195                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1196                                  amdgpu_gtt_size);
1197                 amdgpu_gtt_size = -1;
1198         }
1199
1200         /* valid range is between 4 and 9 inclusive */
1201         if (amdgpu_vm_fragment_size != -1 &&
1202             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1203                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1204                 amdgpu_vm_fragment_size = -1;
1205         }
1206
1207         if (amdgpu_sched_hw_submission < 2) {
1208                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1209                          amdgpu_sched_hw_submission);
1210                 amdgpu_sched_hw_submission = 2;
1211         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1212                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1213                          amdgpu_sched_hw_submission);
1214                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1215         }
1216
1217         amdgpu_device_check_smu_prv_buffer_size(adev);
1218
1219         amdgpu_device_check_vm_size(adev);
1220
1221         amdgpu_device_check_block_size(adev);
1222
1223         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1224
1225         amdgpu_gmc_tmz_set(adev);
1226
1227         if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1228                 amdgpu_num_kcq = 8;
1229                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1230         }
1231
1232         return 0;
1233 }
1234
1235 /**
1236  * amdgpu_switcheroo_set_state - set switcheroo state
1237  *
1238  * @pdev: pci dev pointer
1239  * @state: vga_switcheroo state
1240  *
1241  * Callback for the switcheroo driver.  Suspends or resumes the
1242  * asic before or after it is powered up using ACPI methods.
1243  */
1244 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1245                                         enum vga_switcheroo_state state)
1246 {
1247         struct drm_device *dev = pci_get_drvdata(pdev);
1248         int r;
1249
1250         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1251                 return;
1252
1253         if (state == VGA_SWITCHEROO_ON) {
1254                 pr_info("switched on\n");
1255                 /* don't suspend or resume card normally */
1256                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1257
1258                 pci_set_power_state(dev->pdev, PCI_D0);
1259                 pci_restore_state(dev->pdev);
1260                 r = pci_enable_device(dev->pdev);
1261                 if (r)
1262                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1263                 amdgpu_device_resume(dev, true);
1264
1265                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1266                 drm_kms_helper_poll_enable(dev);
1267         } else {
1268                 pr_info("switched off\n");
1269                 drm_kms_helper_poll_disable(dev);
1270                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1271                 amdgpu_device_suspend(dev, true);
1272                 pci_save_state(dev->pdev);
1273                 /* Shut down the device */
1274                 pci_disable_device(dev->pdev);
1275                 pci_set_power_state(dev->pdev, PCI_D3cold);
1276                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1277         }
1278 }
1279
1280 /**
1281  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1282  *
1283  * @pdev: pci dev pointer
1284  *
1285  * Callback for the switcheroo driver.  Checks if the switcheroo
1286  * state can be changed.
1287  * Returns true if the state can be changed, false if not.
1288  */
1289 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1290 {
1291         struct drm_device *dev = pci_get_drvdata(pdev);
1292
1293         /*
1294         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1295         * locking inversion with the driver load path. And the access here is
1296         * completely racy anyway. So don't bother with locking for now.
1297         */
1298         return atomic_read(&dev->open_count) == 0;
1299 }
1300
1301 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1302         .set_gpu_state = amdgpu_switcheroo_set_state,
1303         .reprobe = NULL,
1304         .can_switch = amdgpu_switcheroo_can_switch,
1305 };
1306
1307 /**
1308  * amdgpu_device_ip_set_clockgating_state - set the CG state
1309  *
1310  * @dev: amdgpu_device pointer
1311  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1312  * @state: clockgating state (gate or ungate)
1313  *
1314  * Sets the requested clockgating state for all instances of
1315  * the hardware IP specified.
1316  * Returns the error code from the last instance.
1317  */
1318 int amdgpu_device_ip_set_clockgating_state(void *dev,
1319                                            enum amd_ip_block_type block_type,
1320                                            enum amd_clockgating_state state)
1321 {
1322         struct amdgpu_device *adev = dev;
1323         int i, r = 0;
1324
1325         for (i = 0; i < adev->num_ip_blocks; i++) {
1326                 if (!adev->ip_blocks[i].status.valid)
1327                         continue;
1328                 if (adev->ip_blocks[i].version->type != block_type)
1329                         continue;
1330                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1331                         continue;
1332                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1333                         (void *)adev, state);
1334                 if (r)
1335                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1336                                   adev->ip_blocks[i].version->funcs->name, r);
1337         }
1338         return r;
1339 }
1340
1341 /**
1342  * amdgpu_device_ip_set_powergating_state - set the PG state
1343  *
1344  * @dev: amdgpu_device pointer
1345  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1346  * @state: powergating state (gate or ungate)
1347  *
1348  * Sets the requested powergating state for all instances of
1349  * the hardware IP specified.
1350  * Returns the error code from the last instance.
1351  */
1352 int amdgpu_device_ip_set_powergating_state(void *dev,
1353                                            enum amd_ip_block_type block_type,
1354                                            enum amd_powergating_state state)
1355 {
1356         struct amdgpu_device *adev = dev;
1357         int i, r = 0;
1358
1359         for (i = 0; i < adev->num_ip_blocks; i++) {
1360                 if (!adev->ip_blocks[i].status.valid)
1361                         continue;
1362                 if (adev->ip_blocks[i].version->type != block_type)
1363                         continue;
1364                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1365                         continue;
1366                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1367                         (void *)adev, state);
1368                 if (r)
1369                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1370                                   adev->ip_blocks[i].version->funcs->name, r);
1371         }
1372         return r;
1373 }
1374
1375 /**
1376  * amdgpu_device_ip_get_clockgating_state - get the CG state
1377  *
1378  * @adev: amdgpu_device pointer
1379  * @flags: clockgating feature flags
1380  *
1381  * Walks the list of IPs on the device and updates the clockgating
1382  * flags for each IP.
1383  * Updates @flags with the feature flags for each hardware IP where
1384  * clockgating is enabled.
1385  */
1386 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1387                                             u32 *flags)
1388 {
1389         int i;
1390
1391         for (i = 0; i < adev->num_ip_blocks; i++) {
1392                 if (!adev->ip_blocks[i].status.valid)
1393                         continue;
1394                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1395                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1396         }
1397 }
1398
1399 /**
1400  * amdgpu_device_ip_wait_for_idle - wait for idle
1401  *
1402  * @adev: amdgpu_device pointer
1403  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1404  *
1405  * Waits for the requested hardware IP to be idle.
1406  * Returns 0 for success or a negative error code on failure.
1407  */
1408 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1409                                    enum amd_ip_block_type block_type)
1410 {
1411         int i, r;
1412
1413         for (i = 0; i < adev->num_ip_blocks; i++) {
1414                 if (!adev->ip_blocks[i].status.valid)
1415                         continue;
1416                 if (adev->ip_blocks[i].version->type == block_type) {
1417                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1418                         if (r)
1419                                 return r;
1420                         break;
1421                 }
1422         }
1423         return 0;
1424
1425 }
1426
1427 /**
1428  * amdgpu_device_ip_is_idle - is the hardware IP idle
1429  *
1430  * @adev: amdgpu_device pointer
1431  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1432  *
1433  * Check if the hardware IP is idle or not.
1434  * Returns true if the IP is idle, false if not.
1435  */
1436 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1437                               enum amd_ip_block_type block_type)
1438 {
1439         int i;
1440
1441         for (i = 0; i < adev->num_ip_blocks; i++) {
1442                 if (!adev->ip_blocks[i].status.valid)
1443                         continue;
1444                 if (adev->ip_blocks[i].version->type == block_type)
1445                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1446         }
1447         return true;
1448
1449 }
1450
1451 /**
1452  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1453  *
1454  * @adev: amdgpu_device pointer
1455  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1456  *
1457  * Returns a pointer to the hardware IP block structure
1458  * if it exists for the asic, otherwise NULL.
1459  */
1460 struct amdgpu_ip_block *
1461 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1462                               enum amd_ip_block_type type)
1463 {
1464         int i;
1465
1466         for (i = 0; i < adev->num_ip_blocks; i++)
1467                 if (adev->ip_blocks[i].version->type == type)
1468                         return &adev->ip_blocks[i];
1469
1470         return NULL;
1471 }
1472
1473 /**
1474  * amdgpu_device_ip_block_version_cmp
1475  *
1476  * @adev: amdgpu_device pointer
1477  * @type: enum amd_ip_block_type
1478  * @major: major version
1479  * @minor: minor version
1480  *
1481  * Returns 0 if the IP block version is equal to or greater than the requested version,
1482  * 1 if it is smaller or the ip_block doesn't exist.
1483  */
1484 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1485                                        enum amd_ip_block_type type,
1486                                        u32 major, u32 minor)
1487 {
1488         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1489
1490         if (ip_block && ((ip_block->version->major > major) ||
1491                         ((ip_block->version->major == major) &&
1492                         (ip_block->version->minor >= minor))))
1493                 return 0;
1494
1495         return 1;
1496 }
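/*
 * A usage sketch for the comparison helper above; the IP type and version
 * numbers are illustrative:
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 0)) {
 *		// GFX IP block is version 8.0 or newer
 *	}
 */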
1497
1498 /**
1499  * amdgpu_device_ip_block_add
1500  *
1501  * @adev: amdgpu_device pointer
1502  * @ip_block_version: pointer to the IP to add
1503  *
1504  * Adds the IP block driver information to the collection of IPs
1505  * on the asic.
1506  */
1507 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1508                                const struct amdgpu_ip_block_version *ip_block_version)
1509 {
1510         if (!ip_block_version)
1511                 return -EINVAL;
1512
1513         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1514                   ip_block_version->funcs->name);
1515
1516         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1517
1518         return 0;
1519 }
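/*
 * A sketch of how an asic init path registers its IP blocks with the helper
 * above; example_ip_block is a placeholder for a real
 * struct amdgpu_ip_block_version:
 *
 *	r = amdgpu_device_ip_block_add(adev, &example_ip_block);
 *	if (r)
 *		return r;
 */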
1520
1521 /**
1522  * amdgpu_device_enable_virtual_display - enable virtual display feature
1523  *
1524  * @adev: amdgpu_device pointer
1525  *
1526  * Enables the virtual display feature if the user has enabled it via
1527  * the module parameter virtual_display.  This feature provides virtual
1528  * display hardware on headless boards or in virtualized environments.
1529  * This function parses and validates the configuration string specified by
1530  * the user and configures the virtual display configuration (number of
1531  * virtual connectors, crtcs, etc.) specified.
1532  */
1533 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1534 {
1535         adev->enable_virtual_display = false;
1536
1537         if (amdgpu_virtual_display) {
1538                 struct drm_device *ddev = adev_to_drm(adev);
1539                 const char *pci_address_name = pci_name(ddev->pdev);
1540                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1541
1542                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1543                 pciaddstr_tmp = pciaddstr;
1544                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1545                         pciaddname = strsep(&pciaddname_tmp, ",");
1546                         if (!strcmp("all", pciaddname)
1547                             || !strcmp(pci_address_name, pciaddname)) {
1548                                 long num_crtc;
1549                                 int res = -1;
1550
1551                                 adev->enable_virtual_display = true;
1552
1553                                 if (pciaddname_tmp)
1554                                         res = kstrtol(pciaddname_tmp, 10,
1555                                                       &num_crtc);
1556
1557                                 if (!res) {
1558                                         if (num_crtc < 1)
1559                                                 num_crtc = 1;
1560                                         if (num_crtc > 6)
1561                                                 num_crtc = 6;
1562                                         adev->mode_info.num_crtc = num_crtc;
1563                                 } else {
1564                                         adev->mode_info.num_crtc = 1;
1565                                 }
1566                                 break;
1567                         }
1568                 }
1569
1570                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1571                          amdgpu_virtual_display, pci_address_name,
1572                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1573
1574                 kfree(pciaddstr);
1575         }
1576 }
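/*
 * Example (illustrative): the string parsed above comes from the
 * amdgpu.virtual_display module parameter, a semicolon-separated list of
 * <PCI address>,<num crtcs> entries (or "all").  The address below is a
 * placeholder:
 *
 *     modprobe amdgpu virtual_display=0000:01:00.0,2
 *     modprobe amdgpu virtual_display=all,1
 */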
1577
1578 /**
1579  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1580  *
1581  * @adev: amdgpu_device pointer
1582  *
1583  * Parses the asic configuration parameters specified in the gpu info
1584  * firmware and makes them available to the driver for use in configuring
1585  * the asic.
1586  * Returns 0 on success, -EINVAL on failure.
1587  */
1588 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1589 {
1590         const char *chip_name;
1591         char fw_name[40];
1592         int err;
1593         const struct gpu_info_firmware_header_v1_0 *hdr;
1594
1595         adev->firmware.gpu_info_fw = NULL;
1596
1597         if (adev->mman.discovery_bin) {
1598                 amdgpu_discovery_get_gfx_info(adev);
1599
1600                 /*
1601                  * FIXME: The bounding box is still needed by Navi12, so
1602  * temporarily read it from gpu_info firmware. Should be dropped
1603                  * when DAL no longer needs it.
1604                  */
1605                 if (adev->asic_type != CHIP_NAVI12)
1606                         return 0;
1607         }
1608
1609         switch (adev->asic_type) {
1610 #ifdef CONFIG_DRM_AMDGPU_SI
1611         case CHIP_VERDE:
1612         case CHIP_TAHITI:
1613         case CHIP_PITCAIRN:
1614         case CHIP_OLAND:
1615         case CHIP_HAINAN:
1616 #endif
1617 #ifdef CONFIG_DRM_AMDGPU_CIK
1618         case CHIP_BONAIRE:
1619         case CHIP_HAWAII:
1620         case CHIP_KAVERI:
1621         case CHIP_KABINI:
1622         case CHIP_MULLINS:
1623 #endif
1624         case CHIP_TOPAZ:
1625         case CHIP_TONGA:
1626         case CHIP_FIJI:
1627         case CHIP_POLARIS10:
1628         case CHIP_POLARIS11:
1629         case CHIP_POLARIS12:
1630         case CHIP_VEGAM:
1631         case CHIP_CARRIZO:
1632         case CHIP_STONEY:
1633         case CHIP_VEGA20:
1634         default:
1635                 return 0;
1636         case CHIP_VEGA10:
1637                 chip_name = "vega10";
1638                 break;
1639         case CHIP_VEGA12:
1640                 chip_name = "vega12";
1641                 break;
1642         case CHIP_RAVEN:
1643                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1644                         chip_name = "raven2";
1645                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1646                         chip_name = "picasso";
1647                 else
1648                         chip_name = "raven";
1649                 break;
1650         case CHIP_ARCTURUS:
1651                 chip_name = "arcturus";
1652                 break;
1653         case CHIP_RENOIR:
1654                 chip_name = "renoir";
1655                 break;
1656         case CHIP_NAVI10:
1657                 chip_name = "navi10";
1658                 break;
1659         case CHIP_NAVI14:
1660                 chip_name = "navi14";
1661                 break;
1662         case CHIP_NAVI12:
1663                 chip_name = "navi12";
1664                 break;
1665         case CHIP_SIENNA_CICHLID:
1666                 chip_name = "sienna_cichlid";
1667                 break;
1668         case CHIP_NAVY_FLOUNDER:
1669                 chip_name = "navy_flounder";
1670                 break;
1671         }
1672
1673         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1674         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1675         if (err) {
1676                 dev_err(adev->dev,
1677                         "Failed to load gpu_info firmware \"%s\"\n",
1678                         fw_name);
1679                 goto out;
1680         }
1681         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1682         if (err) {
1683                 dev_err(adev->dev,
1684                         "Failed to validate gpu_info firmware \"%s\"\n",
1685                         fw_name);
1686                 goto out;
1687         }
1688
1689         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1690         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1691
1692         switch (hdr->version_major) {
1693         case 1:
1694         {
1695                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1696                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1697                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1698
1699                 /*
1700  * Should be dropped when DAL no longer needs it.
1701                  */
1702                 if (adev->asic_type == CHIP_NAVI12)
1703                         goto parse_soc_bounding_box;
1704
1705                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1706                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1707                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1708                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1709                 adev->gfx.config.max_texture_channel_caches =
1710                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1711                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1712                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1713                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1714                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1715                 adev->gfx.config.double_offchip_lds_buf =
1716                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1717                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1718                 adev->gfx.cu_info.max_waves_per_simd =
1719                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1720                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1721                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1722                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1723                 if (hdr->version_minor >= 1) {
1724                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1725                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1726                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1727                         adev->gfx.config.num_sc_per_sh =
1728                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1729                         adev->gfx.config.num_packer_per_sc =
1730                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1731                 }
1732
1733 parse_soc_bounding_box:
1734                 /*
1735  * soc bounding box info is not integrated into the discovery table,
1736  * so it still needs to be parsed from the gpu info firmware when required.
1737                  */
1738                 if (hdr->version_minor == 2) {
1739                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1740                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1741                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1742                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1743                 }
1744                 break;
1745         }
1746         default:
1747                 dev_err(adev->dev,
1748                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1749                 err = -EINVAL;
1750                 goto out;
1751         }
1752 out:
1753         return err;
1754 }
1755
1756 /**
1757  * amdgpu_device_ip_early_init - run early init for hardware IPs
1758  *
1759  * @adev: amdgpu_device pointer
1760  *
1761  * Early initialization pass for hardware IPs.  The hardware IPs that make
1762  * up each asic are discovered and each IP's early_init callback is run.  This
1763  * is the first stage in initializing the asic.
1764  * Returns 0 on success, negative error code on failure.
1765  */
1766 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1767 {
1768         int i, r;
1769
1770         amdgpu_device_enable_virtual_display(adev);
1771
1772         if (amdgpu_sriov_vf(adev)) {
1773                 r = amdgpu_virt_request_full_gpu(adev, true);
1774                 if (r)
1775                         return r;
1776         }
1777
1778         switch (adev->asic_type) {
1779 #ifdef CONFIG_DRM_AMDGPU_SI
1780         case CHIP_VERDE:
1781         case CHIP_TAHITI:
1782         case CHIP_PITCAIRN:
1783         case CHIP_OLAND:
1784         case CHIP_HAINAN:
1785                 adev->family = AMDGPU_FAMILY_SI;
1786                 r = si_set_ip_blocks(adev);
1787                 if (r)
1788                         return r;
1789                 break;
1790 #endif
1791 #ifdef CONFIG_DRM_AMDGPU_CIK
1792         case CHIP_BONAIRE:
1793         case CHIP_HAWAII:
1794         case CHIP_KAVERI:
1795         case CHIP_KABINI:
1796         case CHIP_MULLINS:
1797                 if (adev->flags & AMD_IS_APU)
1798                         adev->family = AMDGPU_FAMILY_KV;
1799                 else
1800                         adev->family = AMDGPU_FAMILY_CI;
1801
1802                 r = cik_set_ip_blocks(adev);
1803                 if (r)
1804                         return r;
1805                 break;
1806 #endif
1807         case CHIP_TOPAZ:
1808         case CHIP_TONGA:
1809         case CHIP_FIJI:
1810         case CHIP_POLARIS10:
1811         case CHIP_POLARIS11:
1812         case CHIP_POLARIS12:
1813         case CHIP_VEGAM:
1814         case CHIP_CARRIZO:
1815         case CHIP_STONEY:
1816                 if (adev->flags & AMD_IS_APU)
1817                         adev->family = AMDGPU_FAMILY_CZ;
1818                 else
1819                         adev->family = AMDGPU_FAMILY_VI;
1820
1821                 r = vi_set_ip_blocks(adev);
1822                 if (r)
1823                         return r;
1824                 break;
1825         case CHIP_VEGA10:
1826         case CHIP_VEGA12:
1827         case CHIP_VEGA20:
1828         case CHIP_RAVEN:
1829         case CHIP_ARCTURUS:
1830         case CHIP_RENOIR:
1831                 if (adev->flags & AMD_IS_APU)
1832                         adev->family = AMDGPU_FAMILY_RV;
1833                 else
1834                         adev->family = AMDGPU_FAMILY_AI;
1835
1836                 r = soc15_set_ip_blocks(adev);
1837                 if (r)
1838                         return r;
1839                 break;
1840         case  CHIP_NAVI10:
1841         case  CHIP_NAVI14:
1842         case  CHIP_NAVI12:
1843         case  CHIP_SIENNA_CICHLID:
1844         case  CHIP_NAVY_FLOUNDER:
1845                 adev->family = AMDGPU_FAMILY_NV;
1846
1847                 r = nv_set_ip_blocks(adev);
1848                 if (r)
1849                         return r;
1850                 break;
1851         default:
1852                 /* FIXME: not supported yet */
1853                 return -EINVAL;
1854         }
1855
1856         amdgpu_amdkfd_device_probe(adev);
1857
1858         adev->pm.pp_feature = amdgpu_pp_feature_mask;
1859         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1860                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1861
1862         for (i = 0; i < adev->num_ip_blocks; i++) {
1863                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1864                         DRM_ERROR("disabled ip block: %d <%s>\n",
1865                                   i, adev->ip_blocks[i].version->funcs->name);
1866                         adev->ip_blocks[i].status.valid = false;
1867                 } else {
1868                         if (adev->ip_blocks[i].version->funcs->early_init) {
1869                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1870                                 if (r == -ENOENT) {
1871                                         adev->ip_blocks[i].status.valid = false;
1872                                 } else if (r) {
1873                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1874                                                   adev->ip_blocks[i].version->funcs->name, r);
1875                                         return r;
1876                                 } else {
1877                                         adev->ip_blocks[i].status.valid = true;
1878                                 }
1879                         } else {
1880                                 adev->ip_blocks[i].status.valid = true;
1881                         }
1882                 }
1883                 /* get the vbios after the asic_funcs are set up */
1884                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1885                         r = amdgpu_device_parse_gpu_info_fw(adev);
1886                         if (r)
1887                                 return r;
1888
1889                         /* Read BIOS */
1890                         if (!amdgpu_get_bios(adev))
1891                                 return -EINVAL;
1892
1893                         r = amdgpu_atombios_init(adev);
1894                         if (r) {
1895                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1896                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1897                                 return r;
1898                         }
1899                 }
1900         }
1901
1902         adev->cg_flags &= amdgpu_cg_mask;
1903         adev->pg_flags &= amdgpu_pg_mask;
1904
1905         return 0;
1906 }
1907
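/**
 * amdgpu_device_ip_hw_init_phase1 - run hw_init for early hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First hardware init pass.  Only the COMMON and IH blocks (plus PSP when
 * running as an SR-IOV VF) are brought up here; the remaining IP blocks
 * are initialized in phase 2, after firmware loading.
 * Returns 0 on success, negative error code on failure.
 */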
1908 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1909 {
1910         int i, r;
1911
1912         for (i = 0; i < adev->num_ip_blocks; i++) {
1913                 if (!adev->ip_blocks[i].status.sw)
1914                         continue;
1915                 if (adev->ip_blocks[i].status.hw)
1916                         continue;
1917                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1918                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1919                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1920                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1921                         if (r) {
1922                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1923                                           adev->ip_blocks[i].version->funcs->name, r);
1924                                 return r;
1925                         }
1926                         adev->ip_blocks[i].status.hw = true;
1927                 }
1928         }
1929
1930         return 0;
1931 }
1932
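/**
 * amdgpu_device_ip_hw_init_phase2 - run hw_init for the remaining hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second hardware init pass.  Walks all IP blocks whose software state has
 * been initialized and runs hw_init for any block that was not already
 * brought up in phase 1 or during firmware loading.
 * Returns 0 on success, negative error code on failure.
 */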
1933 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1934 {
1935         int i, r;
1936
1937         for (i = 0; i < adev->num_ip_blocks; i++) {
1938                 if (!adev->ip_blocks[i].status.sw)
1939                         continue;
1940                 if (adev->ip_blocks[i].status.hw)
1941                         continue;
1942                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1943                 if (r) {
1944                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1945                                   adev->ip_blocks[i].version->funcs->name, r);
1946                         return r;
1947                 }
1948                 adev->ip_blocks[i].status.hw = true;
1949         }
1950
1951         return 0;
1952 }
1953
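/**
 * amdgpu_device_fw_loading - kick off firmware loading
 *
 * @adev: amdgpu_device pointer
 *
 * For VEGA10 and newer ASICs, brings up the PSP block (resumed when in
 * reset or suspend, hw_init otherwise) so it can load firmware for the
 * other IPs, then loads the SMU firmware where applicable.
 * Returns 0 on success, negative error code on failure.
 */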
1954 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1955 {
1956         int r = 0;
1957         int i;
1958         uint32_t smu_version;
1959
1960         if (adev->asic_type >= CHIP_VEGA10) {
1961                 for (i = 0; i < adev->num_ip_blocks; i++) {
1962                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1963                                 continue;
1964
1965                         /* no need to do the fw loading again if already done */
1966                         if (adev->ip_blocks[i].status.hw == true)
1967                                 break;
1968
1969                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
1970                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
1971                                 if (r) {
1972                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
1973                                                           adev->ip_blocks[i].version->funcs->name, r);
1974                                         return r;
1975                                 }
1976                         } else {
1977                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1978                                 if (r) {
1979                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1980                                                           adev->ip_blocks[i].version->funcs->name, r);
1981                                         return r;
1982                                 }
1983                         }
1984
1985                         adev->ip_blocks[i].status.hw = true;
1986                         break;
1987                 }
1988         }
1989
1990         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1991                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1992
1993         return r;
1994 }
1995
1996 /**
1997  * amdgpu_device_ip_init - run init for hardware IPs
1998  *
1999  * @adev: amdgpu_device pointer
2000  *
2001  * Main initialization pass for hardware IPs.  The list of all the hardware
2002  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2003  * are run.  sw_init initializes the software state associated with each IP
2004  * and hw_init initializes the hardware associated with each IP.
2005  * Returns 0 on success, negative error code on failure.
2006  */
2007 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2008 {
2009         int i, r;
2010
2011         r = amdgpu_ras_init(adev);
2012         if (r)
2013                 return r;
2014
2015         for (i = 0; i < adev->num_ip_blocks; i++) {
2016                 if (!adev->ip_blocks[i].status.valid)
2017                         continue;
2018                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2019                 if (r) {
2020                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2021                                   adev->ip_blocks[i].version->funcs->name, r);
2022                         goto init_failed;
2023                 }
2024                 adev->ip_blocks[i].status.sw = true;
2025
2026                 /* need to do gmc hw init early so we can allocate gpu mem */
2027                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2028                         r = amdgpu_device_vram_scratch_init(adev);
2029                         if (r) {
2030                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2031                                 goto init_failed;
2032                         }
2033                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2034                         if (r) {
2035                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2036                                 goto init_failed;
2037                         }
2038                         r = amdgpu_device_wb_init(adev);
2039                         if (r) {
2040                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2041                                 goto init_failed;
2042                         }
2043                         adev->ip_blocks[i].status.hw = true;
2044
2045                         /* right after GMC hw init, we create CSA */
2046                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2047                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2048                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2049                                                                 AMDGPU_CSA_SIZE);
2050                                 if (r) {
2051                                         DRM_ERROR("allocate CSA failed %d\n", r);
2052                                         goto init_failed;
2053                                 }
2054                         }
2055                 }
2056         }
2057
2058         if (amdgpu_sriov_vf(adev))
2059                 amdgpu_virt_init_data_exchange(adev);
2060
2061         r = amdgpu_ib_pool_init(adev);
2062         if (r) {
2063                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2064                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2065                 goto init_failed;
2066         }
2067
2068         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2069         if (r)
2070                 goto init_failed;
2071
2072         r = amdgpu_device_ip_hw_init_phase1(adev);
2073         if (r)
2074                 goto init_failed;
2075
2076         r = amdgpu_device_fw_loading(adev);
2077         if (r)
2078                 goto init_failed;
2079
2080         r = amdgpu_device_ip_hw_init_phase2(adev);
2081         if (r)
2082                 goto init_failed;
2083
2084         /*
2085          * Retired pages will be loaded from eeprom and reserved here;
2086          * this must be done after amdgpu_device_ip_hw_init_phase2 since
2087          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2088          * functional for I2C communication, which is only true at this point.
2089          *
2090          * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2091          * about failures caused by a bad gpu situation and stops the amdgpu
2092          * init process accordingly.  For other failures it still releases all
2093          * the resources and prints an error message rather than returning a
2094          * negative value to the upper level.
2095          *
2096          * Note: theoretically, this should be called before all vram allocations
2097          * to protect retired pages from being abused.
2098          */
2099         r = amdgpu_ras_recovery_init(adev);
2100         if (r)
2101                 goto init_failed;
2102
2103         if (adev->gmc.xgmi.num_physical_nodes > 1)
2104                 amdgpu_xgmi_add_device(adev);
2105         amdgpu_amdkfd_device_init(adev);
2106
2107         amdgpu_fru_get_product_info(adev);
2108
2109 init_failed:
2110         if (amdgpu_sriov_vf(adev))
2111                 amdgpu_virt_release_full_gpu(adev, true);
2112
2113         return r;
2114 }
2115
2116 /**
2117  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2118  *
2119  * @adev: amdgpu_device pointer
2120  *
2121  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2122  * this function before a GPU reset.  If the value is retained after a
2123  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2124  */
2125 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2126 {
2127         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2128 }
2129
2130 /**
2131  * amdgpu_device_check_vram_lost - check if vram is valid
2132  *
2133  * @adev: amdgpu_device pointer
2134  *
2135  * Checks the reset magic value written to the gart pointer in VRAM.
2136  * The driver calls this after a GPU reset to see if the contents of
2137  * VRAM have been lost or not.
2138  * returns true if vram is lost, false if not.
2139  */
2140 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2141 {
2142         if (memcmp(adev->gart.ptr, adev->reset_magic,
2143                         AMDGPU_RESET_MAGIC_NUM))
2144                 return true;
2145
2146         if (!amdgpu_in_reset(adev))
2147                 return false;
2148
2149         /*
2150          * For all ASICs with baco/mode1 reset, the VRAM is
2151          * always assumed to be lost.
2152          */
2153         switch (amdgpu_asic_reset_method(adev)) {
2154         case AMD_RESET_METHOD_BACO:
2155         case AMD_RESET_METHOD_MODE1:
2156                 return true;
2157         default:
2158                 return false;
2159         }
2160 }
2161
2162 /**
2163  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2164  *
2165  * @adev: amdgpu_device pointer
2166  * @state: clockgating state (gate or ungate)
2167  *
2168  * The list of all the hardware IPs that make up the asic is walked and the
2169  * set_clockgating_state callbacks are run.
2170  * The late init pass enables clockgating for hardware IPs, while the
2171  * fini and suspend passes disable it.
2172  * Returns 0 on success, negative error code on failure.
2173  */
2174
2175 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2176                                                 enum amd_clockgating_state state)
2177 {
2178         int i, j, r;
2179
2180         if (amdgpu_emu_mode == 1)
2181                 return 0;
2182
2183         for (j = 0; j < adev->num_ip_blocks; j++) {
2184                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2185                 if (!adev->ip_blocks[i].status.late_initialized)
2186                         continue;
2187                 /* skip CG for VCE/UVD, it's handled specially */
2188                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2189                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2190                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2191                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2192                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2193                         /* enable clockgating to save power */
2194                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2195                                                                                      state);
2196                         if (r) {
2197                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2198                                           adev->ip_blocks[i].version->funcs->name, r);
2199                                 return r;
2200                         }
2201                 }
2202         }
2203
2204         return 0;
2205 }
2206
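/**
 * amdgpu_device_set_pg_state - set powergating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: powergating state (gate or ungate)
 *
 * Walks the list of IP blocks and runs their set_powergating_state
 * callbacks, in list order when gating and in reverse order when ungating.
 * UVD/VCE/VCN/JPEG are skipped since they are handled separately.
 * Returns 0 on success, negative error code on failure.
 */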
2207 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2208 {
2209         int i, j, r;
2210
2211         if (amdgpu_emu_mode == 1)
2212                 return 0;
2213
2214         for (j = 0; j < adev->num_ip_blocks; j++) {
2215                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2216                 if (!adev->ip_blocks[i].status.late_initialized)
2217                         continue;
2218                 /* skip PG for VCE/UVD, it's handled specially */
2219                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2220                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2221                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2222                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2223                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2224                         /* enable powergating to save power */
2225                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2226                                                                                         state);
2227                         if (r) {
2228                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2229                                           adev->ip_blocks[i].version->funcs->name, r);
2230                                 return r;
2231                         }
2232                 }
2233         }
2234         return 0;
2235 }
2236
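/**
 * amdgpu_device_enable_mgpu_fan_boost - enable fan boost on multi-dGPU systems
 *
 * Walks the global mgpu_info list and enables the DPM multi-GPU fan boost
 * feature on every dGPU that does not have it enabled yet.  Does nothing
 * when fewer than two dGPUs are present.
 * Returns 0 on success, negative error code on failure.
 */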
2237 static int amdgpu_device_enable_mgpu_fan_boost(void)
2238 {
2239         struct amdgpu_gpu_instance *gpu_ins;
2240         struct amdgpu_device *adev;
2241         int i, ret = 0;
2242
2243         mutex_lock(&mgpu_info.mutex);
2244
2245         /*
2246          * MGPU fan boost feature should be enabled
2247          * only when there are two or more dGPUs in
2248          * the system
2249          */
2250         if (mgpu_info.num_dgpu < 2)
2251                 goto out;
2252
2253         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2254                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2255                 adev = gpu_ins->adev;
2256                 if (!(adev->flags & AMD_IS_APU) &&
2257                     !gpu_ins->mgpu_fan_enabled) {
2258                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2259                         if (ret)
2260                                 break;
2261
2262                         gpu_ins->mgpu_fan_enabled = 1;
2263                 }
2264         }
2265
2266 out:
2267         mutex_unlock(&mgpu_info.mutex);
2268
2269         return ret;
2270 }
2271
2272 /**
2273  * amdgpu_device_ip_late_init - run late init for hardware IPs
2274  *
2275  * @adev: amdgpu_device pointer
2276  *
2277  * Late initialization pass for hardware IPs.  The list of all the hardware
2278  * IPs that make up the asic is walked and the late_init callbacks are run.
2279  * late_init covers any special initialization that an IP requires
2280  * after all of the IPs have been initialized or something that needs to happen
2281  * late in the init process.
2282  * Returns 0 on success, negative error code on failure.
2283  */
2284 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2285 {
2286         struct amdgpu_gpu_instance *gpu_instance;
2287         int i = 0, r;
2288
2289         for (i = 0; i < adev->num_ip_blocks; i++) {
2290                 if (!adev->ip_blocks[i].status.hw)
2291                         continue;
2292                 if (adev->ip_blocks[i].version->funcs->late_init) {
2293                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2294                         if (r) {
2295                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2296                                           adev->ip_blocks[i].version->funcs->name, r);
2297                                 return r;
2298                         }
2299                 }
2300                 adev->ip_blocks[i].status.late_initialized = true;
2301         }
2302
2303         amdgpu_ras_set_error_query_ready(adev, true);
2304
2305         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2306         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2307
2308         amdgpu_device_fill_reset_magic(adev);
2309
2310         r = amdgpu_device_enable_mgpu_fan_boost();
2311         if (r)
2312                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2313
2314
2315         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2316                 mutex_lock(&mgpu_info.mutex);
2317
2318                 /*
2319                  * Reset device p-state to low as this was booted with high.
2320                  *
2321                  * This should be performed only after all devices from the same
2322                  * hive get initialized.
2323                  *
2324                  * However, the number of devices in the hive is not known in advance;
2325                  * it is counted one by one as the devices are initialized.
2326                  *
2327                  * So, we wait for all XGMI interlinked devices initialized.
2328                  * This may bring some delays as those devices may come from
2329                  * different hives. But that should be OK.
2330                  */
2331                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2332                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2333                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2334                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2335                                         continue;
2336
2337                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2338                                                 AMDGPU_XGMI_PSTATE_MIN);
2339                                 if (r) {
2340                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2341                                         break;
2342                                 }
2343                         }
2344                 }
2345
2346                 mutex_unlock(&mgpu_info.mutex);
2347         }
2348
2349         return 0;
2350 }
2351
2352 /**
2353  * amdgpu_device_ip_fini - run fini for hardware IPs
2354  *
2355  * @adev: amdgpu_device pointer
2356  *
2357  * Main teardown pass for hardware IPs.  The list of all the hardware
2358  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2359  * are run.  hw_fini tears down the hardware associated with each IP
2360  * and sw_fini tears down any software state associated with each IP.
2361  * Returns 0 on success, negative error code on failure.
2362  */
2363 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2364 {
2365         int i, r;
2366
2367         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2368                 amdgpu_virt_release_ras_err_handler_data(adev);
2369
2370         amdgpu_ras_pre_fini(adev);
2371
2372         if (adev->gmc.xgmi.num_physical_nodes > 1)
2373                 amdgpu_xgmi_remove_device(adev);
2374
2375         amdgpu_amdkfd_device_fini(adev);
2376
2377         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2378         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2379
2380         /* need to disable SMC first */
2381         for (i = 0; i < adev->num_ip_blocks; i++) {
2382                 if (!adev->ip_blocks[i].status.hw)
2383                         continue;
2384                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2385                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2386                         /* XXX handle errors */
2387                         if (r) {
2388                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2389                                           adev->ip_blocks[i].version->funcs->name, r);
2390                         }
2391                         adev->ip_blocks[i].status.hw = false;
2392                         break;
2393                 }
2394         }
2395
2396         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2397                 if (!adev->ip_blocks[i].status.hw)
2398                         continue;
2399
2400                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2401                 /* XXX handle errors */
2402                 if (r) {
2403                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2404                                   adev->ip_blocks[i].version->funcs->name, r);
2405                 }
2406
2407                 adev->ip_blocks[i].status.hw = false;
2408         }
2409
2410
2411         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2412                 if (!adev->ip_blocks[i].status.sw)
2413                         continue;
2414
2415                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2416                         amdgpu_ucode_free_bo(adev);
2417                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2418                         amdgpu_device_wb_fini(adev);
2419                         amdgpu_device_vram_scratch_fini(adev);
2420                         amdgpu_ib_pool_fini(adev);
2421                 }
2422
2423                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2424                 /* XXX handle errors */
2425                 if (r) {
2426                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2427                                   adev->ip_blocks[i].version->funcs->name, r);
2428                 }
2429                 adev->ip_blocks[i].status.sw = false;
2430                 adev->ip_blocks[i].status.valid = false;
2431         }
2432
2433         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2434                 if (!adev->ip_blocks[i].status.late_initialized)
2435                         continue;
2436                 if (adev->ip_blocks[i].version->funcs->late_fini)
2437                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2438                 adev->ip_blocks[i].status.late_initialized = false;
2439         }
2440
2441         amdgpu_ras_fini(adev);
2442
2443         if (amdgpu_sriov_vf(adev))
2444                 if (amdgpu_virt_release_full_gpu(adev, false))
2445                         DRM_ERROR("failed to release exclusive mode on fini\n");
2446
2447         return 0;
2448 }
2449
2450 /**
2451  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2452  *
2453  * @work: work_struct.
2454  */
2455 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2456 {
2457         struct amdgpu_device *adev =
2458                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2459         int r;
2460
2461         r = amdgpu_ib_ring_tests(adev);
2462         if (r)
2463                 DRM_ERROR("ib ring test failed (%d).\n", r);
2464 }
2465
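/**
 * amdgpu_device_delay_enable_gfx_off - delayed work handler to enable GFXOFF
 *
 * @work: work_struct.
 *
 * Runs from the gfx_off_delay_work delayed worker.  If GFXOFF is not
 * already enabled and no requests to keep GFX powered are outstanding,
 * asks the SMU to powergate the GFX block.
 */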
2466 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2467 {
2468         struct amdgpu_device *adev =
2469                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2470
2471         mutex_lock(&adev->gfx.gfx_off_mutex);
2472         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2473                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2474                         adev->gfx.gfx_off_state = true;
2475         }
2476         mutex_unlock(&adev->gfx.gfx_off_mutex);
2477 }
2478
2479 /**
2480  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2481  *
2482  * @adev: amdgpu_device pointer
2483  *
2484  * Main suspend function for hardware IPs.  The list of all the hardware
2485  * IPs that make up the asic is walked, clockgating is disabled and the
2486  * suspend callbacks are run.  suspend puts the hardware and software state
2487  * in each IP into a state suitable for suspend.
2488  * Returns 0 on success, negative error code on failure.
2489  */
2490 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2491 {
2492         int i, r;
2493
2494         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2495         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2496
2497         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2498                 if (!adev->ip_blocks[i].status.valid)
2499                         continue;
2500
2501                 /* displays are handled separately */
2502                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2503                         continue;
2504
2505                 /* XXX handle errors */
2506                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2507                 /* XXX handle errors */
2508                 if (r) {
2509                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2510                                   adev->ip_blocks[i].version->funcs->name, r);
2511                         return r;
2512                 }
2513
2514                 adev->ip_blocks[i].status.hw = false;
2515         }
2516
2517         return 0;
2518 }
2519
2520 /**
2521  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2522  *
2523  * @adev: amdgpu_device pointer
2524  *
2525  * Main suspend function for hardware IPs.  The list of all the hardware
2526  * IPs that make up the asic is walked, clockgating is disabled and the
2527  * suspend callbacks are run.  suspend puts the hardware and software state
2528  * in each IP into a state suitable for suspend.
2529  * Returns 0 on success, negative error code on failure.
2530  */
2531 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2532 {
2533         int i, r;
2534
2535         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2536                 if (!adev->ip_blocks[i].status.valid)
2537                         continue;
2538                 /* displays are handled in phase1 */
2539                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2540                         continue;
2541                 /* PSP lost connection when err_event_athub occurs */
2542                 if (amdgpu_ras_intr_triggered() &&
2543                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2544                         adev->ip_blocks[i].status.hw = false;
2545                         continue;
2546                 }
2547                 /* XXX handle errors */
2548                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2549                 /* XXX handle errors */
2550                 if (r) {
2551                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2552                                   adev->ip_blocks[i].version->funcs->name, r);
2553                 }
2554                 adev->ip_blocks[i].status.hw = false;
2555                 /* handle putting the SMC in the appropriate state */
2556                 if (!amdgpu_sriov_vf(adev)) {
2557                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2558                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2559                                 if (r) {
2560                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2561                                                         adev->mp1_state, r);
2562                                         return r;
2563                                 }
2564                         }
2565                 }
2566                 adev->ip_blocks[i].status.hw = false;
2567         }
2568
2569         return 0;
2570 }
2571
2572 /**
2573  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2574  *
2575  * @adev: amdgpu_device pointer
2576  *
2577  * Main suspend function for hardware IPs.  The list of all the hardware
2578  * IPs that make up the asic is walked, clockgating is disabled and the
2579  * suspend callbacks are run.  suspend puts the hardware and software state
2580  * in each IP into a state suitable for suspend.
2581  * Returns 0 on success, negative error code on failure.
2582  */
2583 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2584 {
2585         int r;
2586
2587         if (amdgpu_sriov_vf(adev))
2588                 amdgpu_virt_request_full_gpu(adev, false);
2589
2590         r = amdgpu_device_ip_suspend_phase1(adev);
2591         if (r)
2592                 return r;
2593         r = amdgpu_device_ip_suspend_phase2(adev);
2594
2595         if (amdgpu_sriov_vf(adev))
2596                 amdgpu_virt_release_full_gpu(adev, false);
2597
2598         return r;
2599 }
2600
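/**
 * amdgpu_device_ip_reinit_early_sriov - re-init early IPs after a VF reset
 *
 * @adev: amdgpu_device pointer
 *
 * Re-runs hw_init for the GMC, COMMON, PSP and IH blocks as part of
 * recovering an SR-IOV virtual function after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */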
2601 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2602 {
2603         int i, r;
2604
2605         static enum amd_ip_block_type ip_order[] = {
2606                 AMD_IP_BLOCK_TYPE_GMC,
2607                 AMD_IP_BLOCK_TYPE_COMMON,
2608                 AMD_IP_BLOCK_TYPE_PSP,
2609                 AMD_IP_BLOCK_TYPE_IH,
2610         };
2611
2612         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2613                 int j;
2614                 struct amdgpu_ip_block *block;
2615
2616                 block = &adev->ip_blocks[i];
2617                 block->status.hw = false;
2618
2619                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2620
2621                         if (block->version->type != ip_order[j] ||
2622                                 !block->status.valid)
2623                                 continue;
2624
2625                         r = block->version->funcs->hw_init(adev);
2626                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2627                         if (r)
2628                                 return r;
2629                         block->status.hw = true;
2630                 }
2631         }
2632
2633         return 0;
2634 }
2635
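/**
 * amdgpu_device_ip_reinit_late_sriov - re-init remaining IPs after a VF reset
 *
 * @adev: amdgpu_device pointer
 *
 * Re-initializes the SMC, DCE, GFX, SDMA, UVD, VCE and VCN blocks, in that
 * order, as part of recovering an SR-IOV virtual function after a GPU
 * reset.  The SMC block is resumed while the others go through hw_init.
 * Returns 0 on success, negative error code on failure.
 */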
2636 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2637 {
2638         int i, r;
2639
2640         static enum amd_ip_block_type ip_order[] = {
2641                 AMD_IP_BLOCK_TYPE_SMC,
2642                 AMD_IP_BLOCK_TYPE_DCE,
2643                 AMD_IP_BLOCK_TYPE_GFX,
2644                 AMD_IP_BLOCK_TYPE_SDMA,
2645                 AMD_IP_BLOCK_TYPE_UVD,
2646                 AMD_IP_BLOCK_TYPE_VCE,
2647                 AMD_IP_BLOCK_TYPE_VCN
2648         };
2649
2650         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2651                 int j;
2652                 struct amdgpu_ip_block *block;
2653
2654                 for (j = 0; j < adev->num_ip_blocks; j++) {
2655                         block = &adev->ip_blocks[j];
2656
2657                         if (block->version->type != ip_order[i] ||
2658                                 !block->status.valid ||
2659                                 block->status.hw)
2660                                 continue;
2661
2662                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2663                                 r = block->version->funcs->resume(adev);
2664                         else
2665                                 r = block->version->funcs->hw_init(adev);
2666
2667                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2668                         if (r)
2669                                 return r;
2670                         block->status.hw = true;
2671                 }
2672         }
2673
2674         return 0;
2675 }
2676
2677 /**
2678  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2679  *
2680  * @adev: amdgpu_device pointer
2681  *
2682  * First resume function for hardware IPs.  The list of all the hardware
2683  * IPs that make up the asic is walked and the resume callbacks are run for
2684  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2685  * after a suspend and updates the software state as necessary.  This
2686  * function is also used for restoring the GPU after a GPU reset.
2687  * Returns 0 on success, negative error code on failure.
2688  */
2689 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2690 {
2691         int i, r;
2692
2693         for (i = 0; i < adev->num_ip_blocks; i++) {
2694                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2695                         continue;
2696                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2697                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2698                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2699
2700                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2701                         if (r) {
2702                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2703                                           adev->ip_blocks[i].version->funcs->name, r);
2704                                 return r;
2705                         }
2706                         adev->ip_blocks[i].status.hw = true;
2707                 }
2708         }
2709
2710         return 0;
2711 }
2712
2713 /**
2714  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2715  *
2716  * @adev: amdgpu_device pointer
2717  *
2718  * Second resume function for hardware IPs.  The list of all the hardware
2719  * IPs that make up the asic is walked and the resume callbacks are run for
2720  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2721  * functional state after a suspend and updates the software state as
2722  * necessary.  This function is also used for restoring the GPU after a GPU
2723  * reset.
2724  * Returns 0 on success, negative error code on failure.
2725  */
2726 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2727 {
2728         int i, r;
2729
2730         for (i = 0; i < adev->num_ip_blocks; i++) {
2731                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2732                         continue;
2733                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2734                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2735                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2736                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2737                         continue;
2738                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2739                 if (r) {
2740                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2741                                   adev->ip_blocks[i].version->funcs->name, r);
2742                         return r;
2743                 }
2744                 adev->ip_blocks[i].status.hw = true;
2745         }
2746
2747         return 0;
2748 }
2749
2750 /**
2751  * amdgpu_device_ip_resume - run resume for hardware IPs
2752  *
2753  * @adev: amdgpu_device pointer
2754  *
2755  * Main resume function for hardware IPs.  The hardware IPs
2756  * are split into two resume functions because they are
2757  * also used in recovering from a GPU reset and some additional
2758  * steps need to be taken between them.  In this case (S3/S4) they are
2759  * run sequentially.
2760  * Returns 0 on success, negative error code on failure.
2761  */
2762 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2763 {
2764         int r;
2765
2766         r = amdgpu_device_ip_resume_phase1(adev);
2767         if (r)
2768                 return r;
2769
2770         r = amdgpu_device_fw_loading(adev);
2771         if (r)
2772                 return r;
2773
2774         r = amdgpu_device_ip_resume_phase2(adev);
2775
2776         return r;
2777 }
2778
2779 /**
2780  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2781  *
2782  * @adev: amdgpu_device pointer
2783  *
2784  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2785  */
2786 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2787 {
2788         if (amdgpu_sriov_vf(adev)) {
2789                 if (adev->is_atom_fw) {
2790                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2791                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2792                 } else {
2793                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2794                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2795                 }
2796
2797                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2798                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2799         }
2800 }
2801
2802 /**
2803  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2804  *
2805  * @asic_type: AMD asic type
2806  *
2807  * Check if there is DC (new modesetting infrastructure) support for an asic.
2808  * returns true if DC has support, false if not.
2809  */
2810 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2811 {
2812         switch (asic_type) {
2813 #if defined(CONFIG_DRM_AMD_DC)
2814 #if defined(CONFIG_DRM_AMD_DC_SI)
2815         case CHIP_TAHITI:
2816         case CHIP_PITCAIRN:
2817         case CHIP_VERDE:
2818         case CHIP_OLAND:
2819 #endif
2820         case CHIP_BONAIRE:
2821         case CHIP_KAVERI:
2822         case CHIP_KABINI:
2823         case CHIP_MULLINS:
2824                 /*
2825                  * We have systems in the wild with these ASICs that require
2826                  * LVDS and VGA support which is not supported with DC.
2827                  *
2828                  * Fallback to the non-DC driver here by default so as not to
2829                  * cause regressions.
2830                  */
2831                 return amdgpu_dc > 0;
2832         case CHIP_HAWAII:
2833         case CHIP_CARRIZO:
2834         case CHIP_STONEY:
2835         case CHIP_POLARIS10:
2836         case CHIP_POLARIS11:
2837         case CHIP_POLARIS12:
2838         case CHIP_VEGAM:
2839         case CHIP_TONGA:
2840         case CHIP_FIJI:
2841         case CHIP_VEGA10:
2842         case CHIP_VEGA12:
2843         case CHIP_VEGA20:
2844 #if defined(CONFIG_DRM_AMD_DC_DCN)
2845         case CHIP_RAVEN:
2846         case CHIP_NAVI10:
2847         case CHIP_NAVI14:
2848         case CHIP_NAVI12:
2849         case CHIP_RENOIR:
2850 #endif
2851 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2852         case CHIP_SIENNA_CICHLID:
2853         case CHIP_NAVY_FLOUNDER:
2854 #endif
2855                 return amdgpu_dc != 0;
2856 #endif
2857         default:
2858                 if (amdgpu_dc > 0)
2859                         DRM_INFO("Display Core has been requested via kernel parameter "
2860                                          "but isn't supported by ASIC, ignoring\n");
2861                 return false;
2862         }
2863 }
2864
2865 /**
2866  * amdgpu_device_has_dc_support - check if dc is supported
2867  *
2868  * @adev: amdgpu_device pointer
2869  *
2870  * Returns true for supported, false for not supported
2871  */
2872 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2873 {
2874         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2875                 return false;
2876
2877         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2878 }
2879
2880
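/**
 * amdgpu_device_xgmi_reset_func - per-device XGMI reset work handler
 *
 * @__work: work_struct.
 *
 * Resets one device of an XGMI hive.  For BACO resets the hive's task
 * barrier is used so that all devices enter and exit BACO together; for
 * other reset methods the full barrier is taken before the ASIC reset.
 */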
2881 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2882 {
2883         struct amdgpu_device *adev =
2884                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2885         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2886
2887         /* It's a bug to not have a hive within this function */
2888         if (WARN_ON(!hive))
2889                 return;
2890
2891         /*
2892          * Use task barrier to synchronize all xgmi reset works across the
2893          * hive. task_barrier_enter and task_barrier_exit will block
2894          * until all the threads running the xgmi reset works reach
2895          * those points. task_barrier_full will do both blocks.
2896          */
2897         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2898
2899                 task_barrier_enter(&hive->tb);
2900                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
2901
2902                 if (adev->asic_reset_res)
2903                         goto fail;
2904
2905                 task_barrier_exit(&hive->tb);
2906                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
2907
2908                 if (adev->asic_reset_res)
2909                         goto fail;
2910
2911                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2912                         adev->mmhub.funcs->reset_ras_error_count(adev);
2913         } else {
2914
2915                 task_barrier_full(&hive->tb);
2916                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
2917         }
2918
2919 fail:
2920         if (adev->asic_reset_res)
2921                 DRM_WARN("ASIC reset failed with error %d for drm dev %s",
2922                          adev->asic_reset_res, adev_to_drm(adev)->unique);
2923         amdgpu_put_xgmi_hive(hive);
2924 }
2925
2926 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2927 {
2928         char *input = amdgpu_lockup_timeout;
2929         char *timeout_setting = NULL;
2930         int index = 0;
2931         long timeout;
2932         int ret = 0;
2933
2934         /*
2935          * By default the timeout for non-compute jobs is 10000 ms,
2936          * and there is no timeout enforced on compute jobs.
2937          * In SR-IOV or passthrough mode, the timeout for compute
2938          * jobs is 60000 ms by default.
2939          */
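        /*
         * Editorial example (illustrative only): the parsing below maps the
         * comma-separated amdgpu.lockup_timeout string to engines in the order
         * gfx, compute, sdma, video. For instance
         *
         *     amdgpu.lockup_timeout=10000,60000,10000,10000
         *
         * requests 10 s for gfx/sdma/video and 60 s for compute, while a single
         * value is applied to all non-compute engines (and also to compute under
         * SR-IOV/passthrough).
         */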
2940         adev->gfx_timeout = msecs_to_jiffies(10000);
2941         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2942         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2943                 adev->compute_timeout =  msecs_to_jiffies(60000);
2944         else
2945                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2946
2947         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2948                 while ((timeout_setting = strsep(&input, ",")) &&
2949                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2950                         ret = kstrtol(timeout_setting, 0, &timeout);
2951                         if (ret)
2952                                 return ret;
2953
2954                         if (timeout == 0) {
2955                                 index++;
2956                                 continue;
2957                         } else if (timeout < 0) {
2958                                 timeout = MAX_SCHEDULE_TIMEOUT;
2959                         } else {
2960                                 timeout = msecs_to_jiffies(timeout);
2961                         }
2962
2963                         switch (index++) {
2964                         case 0:
2965                                 adev->gfx_timeout = timeout;
2966                                 break;
2967                         case 1:
2968                                 adev->compute_timeout = timeout;
2969                                 break;
2970                         case 2:
2971                                 adev->sdma_timeout = timeout;
2972                                 break;
2973                         case 3:
2974                                 adev->video_timeout = timeout;
2975                                 break;
2976                         default:
2977                                 break;
2978                         }
2979                 }
2980                 /*
2981                  * There is only one value specified and
2982                  * it should apply to all non-compute jobs.
2983                  */
2984                 if (index == 1) {
2985                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2986                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2987                                 adev->compute_timeout = adev->gfx_timeout;
2988                 }
2989         }
2990
2991         return ret;
2992 }
2993
2994 static const struct attribute *amdgpu_dev_attributes[] = {
2995         &dev_attr_product_name.attr,
2996         &dev_attr_product_number.attr,
2997         &dev_attr_serial_number.attr,
2998         &dev_attr_pcie_replay_count.attr,
2999         NULL
3000 };
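/*
 * Editorial note (illustrative, the BDF below is an assumption): these
 * attributes are created on the PCI device's kobject in amdgpu_device_init(),
 * so they typically appear as e.g.
 *
 *     /sys/bus/pci/devices/0000:03:00.0/product_name
 *     /sys/bus/pci/devices/0000:03:00.0/serial_number
 *
 * and are removed again via sysfs_remove_files() in amdgpu_device_fini().
 */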
3001
3002 /**
3003  * amdgpu_device_init - initialize the driver
3004  *
3005  * @adev: amdgpu_device pointer
3006  * @flags: driver flags
3007  *
3008  * Initializes the driver info and hw (all asics).
3009  * Returns 0 for success or an error on failure.
3010  * Called at driver startup.
3011  */
3012 int amdgpu_device_init(struct amdgpu_device *adev,
3013                        uint32_t flags)
3014 {
3015         struct drm_device *ddev = adev_to_drm(adev);
3016         struct pci_dev *pdev = adev->pdev;
3017         int r, i;
3018         bool boco = false;
3019         u32 max_MBps;
3020
3021         adev->shutdown = false;
3022         adev->flags = flags;
3023
3024         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3025                 adev->asic_type = amdgpu_force_asic_type;
3026         else
3027                 adev->asic_type = flags & AMD_ASIC_MASK;
3028
3029         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3030         if (amdgpu_emu_mode == 1)
3031                 adev->usec_timeout *= 10;
3032         adev->gmc.gart_size = 512 * 1024 * 1024;
3033         adev->accel_working = false;
3034         adev->num_rings = 0;
3035         adev->mman.buffer_funcs = NULL;
3036         adev->mman.buffer_funcs_ring = NULL;
3037         adev->vm_manager.vm_pte_funcs = NULL;
3038         adev->vm_manager.vm_pte_num_scheds = 0;
3039         adev->gmc.gmc_funcs = NULL;
3040         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3041         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3042
3043         adev->smc_rreg = &amdgpu_invalid_rreg;
3044         adev->smc_wreg = &amdgpu_invalid_wreg;
3045         adev->pcie_rreg = &amdgpu_invalid_rreg;
3046         adev->pcie_wreg = &amdgpu_invalid_wreg;
3047         adev->pciep_rreg = &amdgpu_invalid_rreg;
3048         adev->pciep_wreg = &amdgpu_invalid_wreg;
3049         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3050         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3051         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3052         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3053         adev->didt_rreg = &amdgpu_invalid_rreg;
3054         adev->didt_wreg = &amdgpu_invalid_wreg;
3055         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3056         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3057         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3058         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3059
3060         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3061                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3062                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3063
3064         /* mutex initialization is all done here so we
3065          * can recall functions without having locking issues */
3066         atomic_set(&adev->irq.ih.lock, 0);
3067         mutex_init(&adev->firmware.mutex);
3068         mutex_init(&adev->pm.mutex);
3069         mutex_init(&adev->gfx.gpu_clock_mutex);
3070         mutex_init(&adev->srbm_mutex);
3071         mutex_init(&adev->gfx.pipe_reserve_mutex);
3072         mutex_init(&adev->gfx.gfx_off_mutex);
3073         mutex_init(&adev->grbm_idx_mutex);
3074         mutex_init(&adev->mn_lock);
3075         mutex_init(&adev->virt.vf_errors.lock);
3076         hash_init(adev->mn_hash);
3077         atomic_set(&adev->in_gpu_reset, 0);
3078         init_rwsem(&adev->reset_sem);
3079         mutex_init(&adev->psp.mutex);
3080         mutex_init(&adev->notifier_lock);
3081
3082         r = amdgpu_device_check_arguments(adev);
3083         if (r)
3084                 return r;
3085
3086         spin_lock_init(&adev->mmio_idx_lock);
3087         spin_lock_init(&adev->smc_idx_lock);
3088         spin_lock_init(&adev->pcie_idx_lock);
3089         spin_lock_init(&adev->uvd_ctx_idx_lock);
3090         spin_lock_init(&adev->didt_idx_lock);
3091         spin_lock_init(&adev->gc_cac_idx_lock);
3092         spin_lock_init(&adev->se_cac_idx_lock);
3093         spin_lock_init(&adev->audio_endpt_idx_lock);
3094         spin_lock_init(&adev->mm_stats.lock);
3095
3096         INIT_LIST_HEAD(&adev->shadow_list);
3097         mutex_init(&adev->shadow_list_lock);
3098
3099         INIT_DELAYED_WORK(&adev->delayed_init_work,
3100                           amdgpu_device_delayed_init_work_handler);
3101         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3102                           amdgpu_device_delay_enable_gfx_off);
3103
3104         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3105
3106         adev->gfx.gfx_off_req_count = 1;
3107         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3108
3109         atomic_set(&adev->throttling_logging_enabled, 1);
3110         /*
3111          * If throttling continues, logging will be performed every minute
3112          * to avoid log flooding. "-1" is subtracted since the thermal
3113          * throttling interrupt comes every second. Thus, the total logging
3114          * interval is 59 seconds (ratelimited printk interval) + 1 second
3115          * (waiting for the throttling interrupt) = 60 seconds.
3116          */
3117         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3118         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3119
3120         /* Registers mapping */
3121         /* TODO: block userspace mapping of io register */
3122         if (adev->asic_type >= CHIP_BONAIRE) {
3123                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3124                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3125         } else {
3126                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3127                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3128         }
3129
3130         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3131         if (adev->rmmio == NULL) {
3132                 return -ENOMEM;
3133         }
3134         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3135         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3136
3137         /* io port mapping */
3138         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3139                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3140                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3141                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3142                         break;
3143                 }
3144         }
3145         if (adev->rio_mem == NULL)
3146                 DRM_INFO("PCI I/O BAR is not found.\n");
3147
3148         /* enable PCIE atomic ops */
3149         r = pci_enable_atomic_ops_to_root(adev->pdev,
3150                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3151                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3152         if (r) {
3153                 adev->have_atomics_support = false;
3154                 DRM_INFO("PCIe atomic ops are not supported\n");
3155         } else {
3156                 adev->have_atomics_support = true;
3157         }
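        /*
         * Editorial note (assumption, not from the original source):
         * pci_enable_atomic_ops_to_root() fails if any bridge between the GPU
         * and the root port lacks AtomicOp routing, so have_atomics_support can
         * be false even when the GPU itself supports atomics.
         */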
3158
3159         amdgpu_device_get_pcie_info(adev);
3160
3161         if (amdgpu_mcbp)
3162                 DRM_INFO("MCBP is enabled\n");
3163
3164         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3165                 adev->enable_mes = true;
3166
3167         /* detect hw virtualization here */
3168         amdgpu_detect_virtualization(adev);
3169
3170         r = amdgpu_device_get_job_timeout_settings(adev);
3171         if (r) {
3172                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3173                 return r;
3174         }
3175
3176         /* early init functions */
3177         r = amdgpu_device_ip_early_init(adev);
3178         if (r)
3179                 return r;
3180
3181         /* doorbell bar mapping and doorbell index init*/
3182         amdgpu_device_doorbell_init(adev);
3183
3184         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3185         /* this will fail for cards that aren't VGA class devices, just
3186          * ignore it */
3187         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3188
3189         if (amdgpu_device_supports_boco(ddev))
3190                 boco = true;
3191         if (amdgpu_has_atpx() &&
3192             (amdgpu_is_atpx_hybrid() ||
3193              amdgpu_has_atpx_dgpu_power_cntl()) &&
3194             !pci_is_thunderbolt_attached(adev->pdev))
3195                 vga_switcheroo_register_client(adev->pdev,
3196                                                &amdgpu_switcheroo_ops, boco);
3197         if (boco)
3198                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3199
3200         if (amdgpu_emu_mode == 1) {
3201                 /* post the asic on emulation mode */
3202                 emu_soc_asic_init(adev);
3203                 goto fence_driver_init;
3204         }
3205
3206         /* detect if we are with an SRIOV vbios */
3207         amdgpu_device_detect_sriov_bios(adev);
3208
3209         /* check if we need to reset the asic
3210          *  E.g., driver was not cleanly unloaded previously, etc.
3211          */
3212         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3213                 r = amdgpu_asic_reset(adev);
3214                 if (r) {
3215                         dev_err(adev->dev, "asic reset on init failed\n");
3216                         goto failed;
3217                 }
3218         }
3219
3220         /* Post card if necessary */
3221         if (amdgpu_device_need_post(adev)) {
3222                 if (!adev->bios) {
3223                         dev_err(adev->dev, "no vBIOS found\n");
3224                         r = -EINVAL;
3225                         goto failed;
3226                 }
3227                 DRM_INFO("GPU posting now...\n");
3228                 r = amdgpu_device_asic_init(adev);
3229                 if (r) {
3230                         dev_err(adev->dev, "gpu post error!\n");
3231                         goto failed;
3232                 }
3233         }
3234
3235         if (adev->is_atom_fw) {
3236                 /* Initialize clocks */
3237                 r = amdgpu_atomfirmware_get_clock_info(adev);
3238                 if (r) {
3239                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3240                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3241                         goto failed;
3242                 }
3243         } else {
3244                 /* Initialize clocks */
3245                 r = amdgpu_atombios_get_clock_info(adev);
3246                 if (r) {
3247                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3248                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3249                         goto failed;
3250                 }
3251                 /* init i2c buses */
3252                 if (!amdgpu_device_has_dc_support(adev))
3253                         amdgpu_atombios_i2c_init(adev);
3254         }
3255
3256 fence_driver_init:
3257         /* Fence driver */
3258         r = amdgpu_fence_driver_init(adev);
3259         if (r) {
3260                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3261                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3262                 goto failed;
3263         }
3264
3265         /* init the mode config */
3266         drm_mode_config_init(adev_to_drm(adev));
3267
3268         r = amdgpu_device_ip_init(adev);
3269         if (r) {
3270                 /* failed in exclusive mode due to timeout */
3271                 if (amdgpu_sriov_vf(adev) &&
3272                     !amdgpu_sriov_runtime(adev) &&
3273                     amdgpu_virt_mmio_blocked(adev) &&
3274                     !amdgpu_virt_wait_reset(adev)) {
3275                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3276                         /* Don't send request since VF is inactive. */
3277                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3278                         adev->virt.ops = NULL;
3279                         r = -EAGAIN;
3280                         goto failed;
3281                 }
3282                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3283                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3284                 goto failed;
3285         }
3286
3287         dev_info(adev->dev,
3288                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3289                         adev->gfx.config.max_shader_engines,
3290                         adev->gfx.config.max_sh_per_se,
3291                         adev->gfx.config.max_cu_per_sh,
3292                         adev->gfx.cu_info.number);
3293
3294         adev->accel_working = true;
3295
3296         amdgpu_vm_check_compute_bug(adev);
3297
3298         /* Initialize the buffer migration limit. */
3299         if (amdgpu_moverate >= 0)
3300                 max_MBps = amdgpu_moverate;
3301         else
3302                 max_MBps = 8; /* Allow 8 MB/s. */
3303         /* Get a log2 for easy divisions. */
3304         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
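        /*
         * Editorial example (illustrative only): with the default 8 MB/s limit,
         * log2_max_MBps = ilog2(8) = 3; a moverate of 0 still yields
         * ilog2(max(1u, 0)) = 0 rather than an undefined ilog2(0).
         */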
3305
3306         amdgpu_fbdev_init(adev);
3307
3308         r = amdgpu_pm_sysfs_init(adev);
3309         if (r) {
3310                 adev->pm_sysfs_en = false;
3311                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3312         } else
3313                 adev->pm_sysfs_en = true;
3314
3315         r = amdgpu_ucode_sysfs_init(adev);
3316         if (r) {
3317                 adev->ucode_sysfs_en = false;
3318                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3319         } else
3320                 adev->ucode_sysfs_en = true;
3321
3322         if ((amdgpu_testing & 1)) {
3323                 if (adev->accel_working)
3324                         amdgpu_test_moves(adev);
3325                 else
3326                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3327         }
3328         if (amdgpu_benchmarking) {
3329                 if (adev->accel_working)
3330                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3331                 else
3332                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3333         }
3334
3335         /*
3336          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3337          * Otherwise the mgpu fan boost feature will be skipped because the
3338          * gpu instance count would not yet include this device.
3339          */
3340         amdgpu_register_gpu_instance(adev);
3341
3342         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3343          * explicit gating rather than handling it automatically.
3344          */
3345         r = amdgpu_device_ip_late_init(adev);
3346         if (r) {
3347                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3348                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3349                 goto failed;
3350         }
3351
3352         /* must succeed. */
3353         amdgpu_ras_resume(adev);
3354
3355         queue_delayed_work(system_wq, &adev->delayed_init_work,
3356                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3357
3358         if (amdgpu_sriov_vf(adev))
3359                 flush_delayed_work(&adev->delayed_init_work);
3360
3361         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3362         if (r) {
3363                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3364                 return r;
3365         }
3366
3367         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3368                 r = amdgpu_pmu_init(adev);
3369         if (r)
3370                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3371
3372         return 0;
3373
3374 failed:
3375         amdgpu_vf_error_trans_all(adev);
3376         if (boco)
3377                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3378
3379         return r;
3380 }
3381
3382 /**
3383  * amdgpu_device_fini - tear down the driver
3384  *
3385  * @adev: amdgpu_device pointer
3386  *
3387  * Tear down the driver info (all asics).
3388  * Called at driver shutdown.
3389  */
3390 void amdgpu_device_fini(struct amdgpu_device *adev)
3391 {
3392         dev_info(adev->dev, "amdgpu: finishing device.\n");
3393         flush_delayed_work(&adev->delayed_init_work);
3394         adev->shutdown = true;
3395
3396         /* make sure the IB test has finished before entering exclusive mode
3397          * to avoid preemption on the IB test
3398          */
3399         if (amdgpu_sriov_vf(adev))
3400                 amdgpu_virt_request_full_gpu(adev, false);
3401
3402         /* disable all interrupts */
3403         amdgpu_irq_disable_all(adev);
3404         if (adev->mode_info.mode_config_initialized){
3405                 if (!amdgpu_device_has_dc_support(adev))
3406                         drm_helper_force_disable_all(adev_to_drm(adev));
3407                 else
3408                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3409         }
3410         amdgpu_fence_driver_fini(adev);
3411         if (adev->pm_sysfs_en)
3412                 amdgpu_pm_sysfs_fini(adev);
3413         amdgpu_fbdev_fini(adev);
3414         amdgpu_device_ip_fini(adev);
3415         release_firmware(adev->firmware.gpu_info_fw);
3416         adev->firmware.gpu_info_fw = NULL;
3417         adev->accel_working = false;
3418         /* free i2c buses */
3419         if (!amdgpu_device_has_dc_support(adev))
3420                 amdgpu_i2c_fini(adev);
3421
3422         if (amdgpu_emu_mode != 1)
3423                 amdgpu_atombios_fini(adev);
3424
3425         kfree(adev->bios);
3426         adev->bios = NULL;
3427         if (amdgpu_has_atpx() &&
3428             (amdgpu_is_atpx_hybrid() ||
3429              amdgpu_has_atpx_dgpu_power_cntl()) &&
3430             !pci_is_thunderbolt_attached(adev->pdev))
3431                 vga_switcheroo_unregister_client(adev->pdev);
3432         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3433                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3434         vga_client_register(adev->pdev, NULL, NULL, NULL);
3435         if (adev->rio_mem)
3436                 pci_iounmap(adev->pdev, adev->rio_mem);
3437         adev->rio_mem = NULL;
3438         iounmap(adev->rmmio);
3439         adev->rmmio = NULL;
3440         amdgpu_device_doorbell_fini(adev);
3441
3442         if (adev->ucode_sysfs_en)
3443                 amdgpu_ucode_sysfs_fini(adev);
3444
3445         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3446         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3447                 amdgpu_pmu_fini(adev);
3448         if (adev->mman.discovery_bin)
3449                 amdgpu_discovery_fini(adev);
3450 }
3451
3452
3453 /*
3454  * Suspend & resume.
3455  */
3456 /**
3457  * amdgpu_device_suspend - initiate device suspend
3458  *
3459  * @dev: drm dev pointer
3460  * @fbcon: notify the fbdev of suspend
3461  *
3462  * Puts the hw in the suspend state (all asics).
3463  * Returns 0 for success or an error on failure.
3464  * Called at driver suspend.
3465  */
3466 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3467 {
3468         struct amdgpu_device *adev;
3469         struct drm_crtc *crtc;
3470         struct drm_connector *connector;
3471         struct drm_connector_list_iter iter;
3472         int r;
3473
3474         adev = drm_to_adev(dev);
3475
3476         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3477                 return 0;
3478
3479         adev->in_suspend = true;
3480         drm_kms_helper_poll_disable(dev);
3481
3482         if (fbcon)
3483                 amdgpu_fbdev_set_suspend(adev, 1);
3484
3485         cancel_delayed_work_sync(&adev->delayed_init_work);
3486
3487         if (!amdgpu_device_has_dc_support(adev)) {
3488                 /* turn off display hw */
3489                 drm_modeset_lock_all(dev);
3490                 drm_connector_list_iter_begin(dev, &iter);
3491                 drm_for_each_connector_iter(connector, &iter)
3492                         drm_helper_connector_dpms(connector,
3493                                                   DRM_MODE_DPMS_OFF);
3494                 drm_connector_list_iter_end(&iter);
3495                 drm_modeset_unlock_all(dev);
3496                 /* unpin the front buffers and cursors */
3497                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3498                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3499                         struct drm_framebuffer *fb = crtc->primary->fb;
3500                         struct amdgpu_bo *robj;
3501
3502                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3503                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3504                                 r = amdgpu_bo_reserve(aobj, true);
3505                                 if (r == 0) {
3506                                         amdgpu_bo_unpin(aobj);
3507                                         amdgpu_bo_unreserve(aobj);
3508                                 }
3509                         }
3510
3511                         if (fb == NULL || fb->obj[0] == NULL) {
3512                                 continue;
3513                         }
3514                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3515                         /* don't unpin kernel fb objects */
3516                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3517                                 r = amdgpu_bo_reserve(robj, true);
3518                                 if (r == 0) {
3519                                         amdgpu_bo_unpin(robj);
3520                                         amdgpu_bo_unreserve(robj);
3521                                 }
3522                         }
3523                 }
3524         }
3525
3526         amdgpu_ras_suspend(adev);
3527
3528         r = amdgpu_device_ip_suspend_phase1(adev);
3529
3530         amdgpu_amdkfd_suspend(adev, !fbcon);
3531
3532         /* evict vram memory */
3533         amdgpu_bo_evict_vram(adev);
3534
3535         amdgpu_fence_driver_suspend(adev);
3536
3537         r = amdgpu_device_ip_suspend_phase2(adev);
3538
3539         /* evict remaining vram memory
3540          * This second call to evict vram is to evict the gart page table
3541          * using the CPU.
3542          */
3543         amdgpu_bo_evict_vram(adev);
3544
3545         return 0;
3546 }
3547
3548 /**
3549  * amdgpu_device_resume - initiate device resume
3550  *
3551  * @dev: drm dev pointer
3552  * @fbcon: notify the fbdev of resume
3553  *
3554  * Bring the hw back to operating state (all asics).
3555  * Returns 0 for success or an error on failure.
3556  * Called at driver resume.
3557  */
3558 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3559 {
3560         struct drm_connector *connector;
3561         struct drm_connector_list_iter iter;
3562         struct amdgpu_device *adev = drm_to_adev(dev);
3563         struct drm_crtc *crtc;
3564         int r = 0;
3565
3566         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3567                 return 0;
3568
3569         /* post card */
3570         if (amdgpu_device_need_post(adev)) {
3571                 r = amdgpu_device_asic_init(adev);
3572                 if (r)
3573                         dev_err(adev->dev, "amdgpu asic init failed\n");
3574         }
3575
3576         r = amdgpu_device_ip_resume(adev);
3577         if (r) {
3578                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3579                 return r;
3580         }
3581         amdgpu_fence_driver_resume(adev);
3582
3583
3584         r = amdgpu_device_ip_late_init(adev);
3585         if (r)
3586                 return r;
3587
3588         queue_delayed_work(system_wq, &adev->delayed_init_work,
3589                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3590
3591         if (!amdgpu_device_has_dc_support(adev)) {
3592                 /* pin cursors */
3593                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3594                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3595
3596                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3597                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3598                                 r = amdgpu_bo_reserve(aobj, true);
3599                                 if (r == 0) {
3600                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3601                                         if (r != 0)
3602                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3603                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3604                                         amdgpu_bo_unreserve(aobj);
3605                                 }
3606                         }
3607                 }
3608         }
3609         r = amdgpu_amdkfd_resume(adev, !fbcon);
3610         if (r)
3611                 return r;
3612
3613         /* Make sure IB tests flushed */
3614         flush_delayed_work(&adev->delayed_init_work);
3615
3616         /* blat the mode back in */
3617         if (fbcon) {
3618                 if (!amdgpu_device_has_dc_support(adev)) {
3619                         /* pre DCE11 */
3620                         drm_helper_resume_force_mode(dev);
3621
3622                         /* turn on display hw */
3623                         drm_modeset_lock_all(dev);
3624
3625                         drm_connector_list_iter_begin(dev, &iter);
3626                         drm_for_each_connector_iter(connector, &iter)
3627                                 drm_helper_connector_dpms(connector,
3628                                                           DRM_MODE_DPMS_ON);
3629                         drm_connector_list_iter_end(&iter);
3630
3631                         drm_modeset_unlock_all(dev);
3632                 }
3633                 amdgpu_fbdev_set_suspend(adev, 0);
3634         }
3635
3636         drm_kms_helper_poll_enable(dev);
3637
3638         amdgpu_ras_resume(adev);
3639
3640         /*
3641          * Most of the connector probing functions try to acquire runtime pm
3642          * refs to ensure that the GPU is powered on when connector polling is
3643          * performed. Since we're calling this from a runtime PM callback,
3644          * trying to acquire rpm refs will cause us to deadlock.
3645          *
3646          * Since we're guaranteed to be holding the rpm lock, it's safe to
3647          * temporarily disable the rpm helpers so this doesn't deadlock us.
3648          */
3649 #ifdef CONFIG_PM
3650         dev->dev->power.disable_depth++;
3651 #endif
3652         if (!amdgpu_device_has_dc_support(adev))
3653                 drm_helper_hpd_irq_event(dev);
3654         else
3655                 drm_kms_helper_hotplug_event(dev);
3656 #ifdef CONFIG_PM
3657         dev->dev->power.disable_depth--;
3658 #endif
3659         adev->in_suspend = false;
3660
3661         return 0;
3662 }
3663
3664 /**
3665  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3666  *
3667  * @adev: amdgpu_device pointer
3668  *
3669  * The list of all the hardware IPs that make up the asic is walked and
3670  * the check_soft_reset callbacks are run.  check_soft_reset determines
3671  * if the asic is still hung or not.
3672  * Returns true if any of the IPs are still in a hung state, false if not.
3673  */
3674 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3675 {
3676         int i;
3677         bool asic_hang = false;
3678
3679         if (amdgpu_sriov_vf(adev))
3680                 return true;
3681
3682         if (amdgpu_asic_need_full_reset(adev))
3683                 return true;
3684
3685         for (i = 0; i < adev->num_ip_blocks; i++) {
3686                 if (!adev->ip_blocks[i].status.valid)
3687                         continue;
3688                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3689                         adev->ip_blocks[i].status.hang =
3690                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3691                 if (adev->ip_blocks[i].status.hang) {
3692                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3693                         asic_hang = true;
3694                 }
3695         }
3696         return asic_hang;
3697 }
3698
3699 /**
3700  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3701  *
3702  * @adev: amdgpu_device pointer
3703  *
3704  * The list of all the hardware IPs that make up the asic is walked and the
3705  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3706  * handles any IP specific hardware or software state changes that are
3707  * necessary for a soft reset to succeed.
3708  * Returns 0 on success, negative error code on failure.
3709  */
3710 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3711 {
3712         int i, r = 0;
3713
3714         for (i = 0; i < adev->num_ip_blocks; i++) {
3715                 if (!adev->ip_blocks[i].status.valid)
3716                         continue;
3717                 if (adev->ip_blocks[i].status.hang &&
3718                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3719                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3720                         if (r)
3721                                 return r;
3722                 }
3723         }
3724
3725         return 0;
3726 }
3727
3728 /**
3729  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3730  *
3731  * @adev: amdgpu_device pointer
3732  *
3733  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3734  * reset is necessary to recover.
3735  * Returns true if a full asic reset is required, false if not.
3736  */
3737 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3738 {
3739         int i;
3740
3741         if (amdgpu_asic_need_full_reset(adev))
3742                 return true;
3743
3744         for (i = 0; i < adev->num_ip_blocks; i++) {
3745                 if (!adev->ip_blocks[i].status.valid)
3746                         continue;
3747                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3748                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3749                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3750                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3751                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3752                         if (adev->ip_blocks[i].status.hang) {
3753                                 dev_info(adev->dev, "Some blocks need a full reset!\n");
3754                                 return true;
3755                         }
3756                 }
3757         }
3758         return false;
3759 }
3760
3761 /**
3762  * amdgpu_device_ip_soft_reset - do a soft reset
3763  *
3764  * @adev: amdgpu_device pointer
3765  *
3766  * The list of all the hardware IPs that make up the asic is walked and the
3767  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3768  * IP specific hardware or software state changes that are necessary to soft
3769  * reset the IP.
3770  * Returns 0 on success, negative error code on failure.
3771  */
3772 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3773 {
3774         int i, r = 0;
3775
3776         for (i = 0; i < adev->num_ip_blocks; i++) {
3777                 if (!adev->ip_blocks[i].status.valid)
3778                         continue;
3779                 if (adev->ip_blocks[i].status.hang &&
3780                     adev->ip_blocks[i].version->funcs->soft_reset) {
3781                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3782                         if (r)
3783                                 return r;
3784                 }
3785         }
3786
3787         return 0;
3788 }
3789
3790 /**
3791  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3792  *
3793  * @adev: amdgpu_device pointer
3794  *
3795  * The list of all the hardware IPs that make up the asic is walked and the
3796  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3797  * handles any IP specific hardware or software state changes that are
3798  * necessary after the IP has been soft reset.
3799  * Returns 0 on success, negative error code on failure.
3800  */
3801 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3802 {
3803         int i, r = 0;
3804
3805         for (i = 0; i < adev->num_ip_blocks; i++) {
3806                 if (!adev->ip_blocks[i].status.valid)
3807                         continue;
3808                 if (adev->ip_blocks[i].status.hang &&
3809                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3810                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3811                 if (r)
3812                         return r;
3813         }
3814
3815         return 0;
3816 }
3817
3818 /**
3819  * amdgpu_device_recover_vram - Recover some VRAM contents
3820  *
3821  * @adev: amdgpu_device pointer
3822  *
3823  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3824  * restore things like GPUVM page tables after a GPU reset where
3825  * the contents of VRAM might be lost.
3826  *
3827  * Returns:
3828  * 0 on success, negative error code on failure.
3829  */
3830 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3831 {
3832         struct dma_fence *fence = NULL, *next = NULL;
3833         struct amdgpu_bo *shadow;
3834         long r = 1, tmo;
3835
3836         if (amdgpu_sriov_runtime(adev))
3837                 tmo = msecs_to_jiffies(8000);
3838         else
3839                 tmo = msecs_to_jiffies(100);
3840
3841         dev_info(adev->dev, "recover vram bo from shadow start\n");
3842         mutex_lock(&adev->shadow_list_lock);
3843         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3844
3845                 /* No need to recover an evicted BO */
3846                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3847                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3848                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3849                         continue;
3850
3851                 r = amdgpu_bo_restore_shadow(shadow, &next);
3852                 if (r)
3853                         break;
3854
3855                 if (fence) {
3856                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3857                         dma_fence_put(fence);
3858                         fence = next;
3859                         if (tmo == 0) {
3860                                 r = -ETIMEDOUT;
3861                                 break;
3862                         } else if (tmo < 0) {
3863                                 r = tmo;
3864                                 break;
3865                         }
3866                 } else {
3867                         fence = next;
3868                 }
3869         }
3870         mutex_unlock(&adev->shadow_list_lock);
3871
3872         if (fence)
3873                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3874         dma_fence_put(fence);
3875
3876         if (r < 0 || tmo <= 0) {
3877                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3878                 return -EIO;
3879         }
3880
3881         dev_info(adev->dev, "recover vram bo from shadow done\n");
3882         return 0;
3883 }
3884
3885
3886 /**
3887  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3888  *
3889  * @adev: amdgpu device pointer
3890  * @from_hypervisor: request from hypervisor
3891  *
3892  * Do a VF FLR and reinitialize the ASIC.
3893  * Returns 0 on success or an error on failure.
3894  */
3895 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3896                                      bool from_hypervisor)
3897 {
3898         int r;
3899
3900         if (from_hypervisor)
3901                 r = amdgpu_virt_request_full_gpu(adev, true);
3902         else
3903                 r = amdgpu_virt_reset_gpu(adev);
3904         if (r)
3905                 return r;
3906
3907         amdgpu_amdkfd_pre_reset(adev);
3908
3909         /* Resume IP prior to SMC */
3910         r = amdgpu_device_ip_reinit_early_sriov(adev);
3911         if (r)
3912                 goto error;
3913
3914         amdgpu_virt_init_data_exchange(adev);
3915         /* we need to recover the GART prior to running SMC/CP/SDMA resume */
3916         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
3917
3918         r = amdgpu_device_fw_loading(adev);
3919         if (r)
3920                 return r;
3921
3922         /* now we are okay to resume SMC/CP/SDMA */
3923         r = amdgpu_device_ip_reinit_late_sriov(adev);
3924         if (r)
3925                 goto error;
3926
3927         amdgpu_irq_gpu_reset_resume_helper(adev);
3928         r = amdgpu_ib_ring_tests(adev);
3929         amdgpu_amdkfd_post_reset(adev);
3930
3931 error:
3932         amdgpu_virt_release_full_gpu(adev, true);
3933         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3934                 amdgpu_inc_vram_lost(adev);
3935                 r = amdgpu_device_recover_vram(adev);
3936         }
3937
3938         return r;
3939 }
3940
3941 /**
3942  * amdgpu_device_has_job_running - check if there is any job in the mirror list
3943  *
3944  * @adev: amdgpu device pointer
3945  *
3946  * Check whether any ring's scheduler mirror list still contains a job.
3947  */
3948 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
3949 {
3950         int i;
3951         struct drm_sched_job *job;
3952
3953         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3954                 struct amdgpu_ring *ring = adev->rings[i];
3955
3956                 if (!ring || !ring->sched.thread)
3957                         continue;
3958
3959                 spin_lock(&ring->sched.job_list_lock);
3960                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
3961                                 struct drm_sched_job, node);
3962                 spin_unlock(&ring->sched.job_list_lock);
3963                 if (job)
3964                         return true;
3965         }
3966         return false;
3967 }
3968
3969 /**
3970  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3971  *
3972  * @adev: amdgpu device pointer
3973  *
3974  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3975  * a hung GPU.
3976  */
3977 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3978 {
3979         if (!amdgpu_device_ip_check_soft_reset(adev)) {
3980                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
3981                 return false;
3982         }
3983
3984         if (amdgpu_gpu_recovery == 0)
3985                 goto disabled;
3986
3987         if (amdgpu_sriov_vf(adev))
3988                 return true;
3989
3990         if (amdgpu_gpu_recovery == -1) {
3991                 switch (adev->asic_type) {
3992                 case CHIP_BONAIRE:
3993                 case CHIP_HAWAII:
3994                 case CHIP_TOPAZ:
3995                 case CHIP_TONGA:
3996                 case CHIP_FIJI:
3997                 case CHIP_POLARIS10:
3998                 case CHIP_POLARIS11:
3999                 case CHIP_POLARIS12:
4000                 case CHIP_VEGAM:
4001                 case CHIP_VEGA20:
4002                 case CHIP_VEGA10:
4003                 case CHIP_VEGA12:
4004                 case CHIP_RAVEN:
4005                 case CHIP_ARCTURUS:
4006                 case CHIP_RENOIR:
4007                 case CHIP_NAVI10:
4008                 case CHIP_NAVI14:
4009                 case CHIP_NAVI12:
4010                 case CHIP_SIENNA_CICHLID:
4011                         break;
4012                 default:
4013                         goto disabled;
4014                 }
4015         }
4016
4017         return true;
4018
4019 disabled:
4020         dev_info(adev->dev, "GPU recovery disabled.\n");
4021         return false;
4022 }
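/*
 * Editorial note (illustrative, derived from the checks above): with the
 * default amdgpu.gpu_recovery=-1, recovery is attempted only on the ASICs in
 * the switch above (and always for SR-IOV VFs); gpu_recovery=0 disables
 * recovery entirely and gpu_recovery=1 allows it on any ASIC, e.g.
 *
 *     modprobe amdgpu gpu_recovery=1
 */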
4023
4024
4025 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4026                                         struct amdgpu_job *job,
4027                                         bool *need_full_reset_arg)
4028 {
4029         int i, r = 0;
4030         bool need_full_reset  = *need_full_reset_arg;
4031
4032         amdgpu_debugfs_wait_dump(adev);
4033
4034         /* block all schedulers and reset given job's ring */
4035         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4036                 struct amdgpu_ring *ring = adev->rings[i];
4037
4038                 if (!ring || !ring->sched.thread)
4039                         continue;
4040
4041                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4042                 amdgpu_fence_driver_force_completion(ring);
4043         }
4044
4045         if(job)
4046                 drm_sched_increase_karma(&job->base);
4047
4048         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4049         if (!amdgpu_sriov_vf(adev)) {
4050
4051                 if (!need_full_reset)
4052                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4053
4054                 if (!need_full_reset) {
4055                         amdgpu_device_ip_pre_soft_reset(adev);
4056                         r = amdgpu_device_ip_soft_reset(adev);
4057                         amdgpu_device_ip_post_soft_reset(adev);
4058                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4059                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4060                                 need_full_reset = true;
4061                         }
4062                 }
4063
4064                 if (need_full_reset)
4065                         r = amdgpu_device_ip_suspend(adev);
4066
4067                 *need_full_reset_arg = need_full_reset;
4068         }
4069
4070         return r;
4071 }
4072
4073 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4074                                struct list_head *device_list_handle,
4075                                bool *need_full_reset_arg)
4076 {
4077         struct amdgpu_device *tmp_adev = NULL;
4078         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4079         int r = 0;
4080
4081         /*
4082          * ASIC reset has to be done on all XGMI hive nodes ASAP
4083          * to allow proper link negotiation in FW (within 1 sec)
4084          */
4085         if (need_full_reset) {
4086                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4087                         /* For XGMI run all resets in parallel to speed up the process */
4088                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4089                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4090                                         r = -EALREADY;
4091                         } else
4092                                 r = amdgpu_asic_reset(tmp_adev);
4093
4094                         if (r) {
4095                                 dev_err(tmp_adev->dev, "ASIC reset failed with error %d for drm dev %s",
4096                                          r, adev_to_drm(tmp_adev)->unique);
4097                                 break;
4098                         }
4099                 }
4100
4101                 /* For XGMI wait for all resets to complete before proceed */
4102                 if (!r) {
4103                         list_for_each_entry(tmp_adev, device_list_handle,
4104                                             gmc.xgmi.head) {
4105                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4106                                         flush_work(&tmp_adev->xgmi_reset_work);
4107                                         r = tmp_adev->asic_reset_res;
4108                                         if (r)
4109                                                 break;
4110                                 }
4111                         }
4112                 }
4113         }
4114
4115         if (!r && amdgpu_ras_intr_triggered()) {
4116                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4117                         if (tmp_adev->mmhub.funcs &&
4118                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4119                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4120                 }
4121
4122                 amdgpu_ras_intr_cleared();
4123         }
4124
4125         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4126                 if (need_full_reset) {
4127                         /* post card */
4128                         if (amdgpu_device_asic_init(tmp_adev))
4129                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4130
4131                         if (!r) {
4132                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4133                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4134                                 if (r)
4135                                         goto out;
4136
4137                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4138                                 if (vram_lost) {
4139                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4140                                         amdgpu_inc_vram_lost(tmp_adev);
4141                                 }
4142
4143                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4144                                 if (r)
4145                                         goto out;
4146
4147                                 r = amdgpu_device_fw_loading(tmp_adev);
4148                                 if (r)
4149                                         return r;
4150
4151                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4152                                 if (r)
4153                                         goto out;
4154
4155                                 if (vram_lost)
4156                                         amdgpu_device_fill_reset_magic(tmp_adev);
4157
4158                                 /*
4159                                  * Add this ASIC as tracked, as the reset has already
4160                                  * completed successfully.
4161                                  */
4162                                 amdgpu_register_gpu_instance(tmp_adev);
4163
4164                                 r = amdgpu_device_ip_late_init(tmp_adev);
4165                                 if (r)
4166                                         goto out;
4167
4168                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4169
4170                                 /*
4171                                  * The GPU enters a bad state once the number of
4172                                  * faulty pages retired by ECC reaches the threshold,
4173                                  * and RAS recovery is scheduled next. So check here
4174                                  * and abort the recovery if the bad page threshold
4175                                  * has indeed been exceeded, reminding the user to
4176                                  * either retire this GPU or set a bigger
4177                                  * bad_page_threshold value before probing the
4178                                  * driver again.
4179                                  */
4180                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4181                                         /* must succeed. */
4182                                         amdgpu_ras_resume(tmp_adev);
4183                                 } else {
4184                                         r = -EINVAL;
4185                                         goto out;
4186                                 }
4187
4188                                 /* Update PSP FW topology after reset */
4189                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4190                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4191                         }
4192                 }
4193
4194 out:
4195                 if (!r) {
4196                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4197                         r = amdgpu_ib_ring_tests(tmp_adev);
4198                         if (r) {
4199                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4200                                 r = amdgpu_device_ip_suspend(tmp_adev);
4201                                 need_full_reset = true;
4202                                 r = -EAGAIN;
4203                                 goto end;
4204                         }
4205                 }
4206
4207                 if (!r)
4208                         r = amdgpu_device_recover_vram(tmp_adev);
4209                 else
4210                         tmp_adev->asic_reset_res = r;
4211         }
4212
4213 end:
4214         *need_full_reset_arg = need_full_reset;
4215         return r;
4216 }
4217
4218 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4219                                 struct amdgpu_hive_info *hive)
4220 {
4221         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4222                 return false;
4223
4224         if (hive) {
4225                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4226         } else {
4227                 down_write(&adev->reset_sem);
4228         }
4229
4230         atomic_inc(&adev->gpu_reset_counter);
4231         switch (amdgpu_asic_reset_method(adev)) {
4232         case AMD_RESET_METHOD_MODE1:
4233                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4234                 break;
4235         case AMD_RESET_METHOD_MODE2:
4236                 adev->mp1_state = PP_MP1_STATE_RESET;
4237                 break;
4238         default:
4239                 adev->mp1_state = PP_MP1_STATE_NONE;
4240                 break;
4241         }
4242
4243         return true;
4244 }
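/*
 * Editorial note (illustrative only): amdgpu_device_lock_adev() is a trylock;
 * it returns false when another reset already owns adev->in_gpu_reset, and a
 * successful call must be paired with amdgpu_device_unlock_adev() below, which
 * restores mp1_state, clears in_gpu_reset and drops reset_sem.
 */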
4245
4246 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4247 {
4248         amdgpu_vf_error_trans_all(adev);
4249         adev->mp1_state = PP_MP1_STATE_NONE;
4250         atomic_set(&adev->in_gpu_reset, 0);
4251         up_write(&adev->reset_sem);
4252 }
4253
4254 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4255 {
4256         struct pci_dev *p = NULL;
4257
4258         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4259                         adev->pdev->bus->number, 1);
4260         if (p) {
4261                 pm_runtime_enable(&(p->dev));
4262                 pm_runtime_resume(&(p->dev));
4263         }
4264 }
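/*
 * Editorial note (assumption, for illustration): PCI function 1 at the GPU's
 * bus/slot, looked up in both of these helpers, is normally the GPU's HDMI/DP
 * audio controller; the pair of helpers keeps that controller runtime-suspended
 * across a reset and resumes it afterwards.
 */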
4265
4266 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4267 {
4268         enum amd_reset_method reset_method;
4269         struct pci_dev *p = NULL;
4270         u64 expires;
4271
4272         /*
4273          * For now, only BACO and mode1 reset are confirmed
4274          * to suffer from the audio issue if not properly suspended.
4275          */
4276         reset_method = amdgpu_asic_reset_method(adev);
4277         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4278              (reset_method != AMD_RESET_METHOD_MODE1))
4279                 return -EINVAL;
4280
4281         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4282                         adev->pdev->bus->number, 1);
4283         if (!p)
4284                 return -ENODEV;
4285
4286         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4287         if (!expires)
4288                 /*
4289                  * If we cannot get the audio device autosuspend delay,
4290                  * fall back to a fixed 4s interval. The audio controller's
4291                  * default autosuspend delay is 3s, so 4s is guaranteed to
4292                  * cover that.
4293                  */
4294                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4295
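              /* Poll until the audio function reports runtime-suspended, giving up once the deadline passes */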
4296         while (!pm_runtime_status_suspended(&(p->dev))) {
4297                 if (!pm_runtime_suspend(&(p->dev)))
4298                         break;
4299
4300                 if (expires < ktime_get_mono_fast_ns()) {
4301                         dev_warn(adev->dev, "failed to suspend display audio\n");
4302                         /* TODO: abort the succeeding gpu reset? */
4303                         return -ETIMEDOUT;
4304                 }
4305         }
4306
4307         pm_runtime_disable(&(p->dev));
4308
4309         return 0;
4310 }
4311
4312 /**
4313  * amdgpu_device_gpu_recover - reset the ASIC and recover the scheduler
4314  *
4315  * @adev: amdgpu device pointer
4316  * @job: which job triggered the hang
4317  *
4318  * Attempt to reset the GPU if it has hung (all ASICs).
4319  * Attempt a soft reset or a full reset and reinitialize the ASIC.
4320  * Returns 0 for success or an error on failure.
4321  */
4322
4323 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4324                               struct amdgpu_job *job)
4325 {
4326         struct list_head device_list, *device_list_handle =  NULL;
4327         bool need_full_reset = false;
4328         bool job_signaled = false;
4329         struct amdgpu_hive_info *hive = NULL;
4330         struct amdgpu_device *tmp_adev = NULL;
4331         int i, r = 0;
4332         bool need_emergency_restart = false;
4333         bool audio_suspended = false;
4334
4335         /*
4336          * Special case: RAS triggered and full reset isn't supported
4337          */
4338         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4339
4340         /*
4341          * Flush RAM to disk so that after reboot
4342          * the user can read the log and see why the system rebooted.
4343          */
4344         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4345                 DRM_WARN("Emergency reboot.");
4346
4347                 ksys_sync_helper();
4348                 emergency_restart();
4349         }
4350
4351         dev_info(adev->dev, "GPU %s begin!\n",
4352                 need_emergency_restart ? "jobs stop":"reset");
4353
4354         /*
4355          * Here we trylock to avoid a chain of resets executing, triggered
4356          * either by jobs on different adevs in an XGMI hive or by jobs on
4357          * different schedulers for the same device, while this TO handler
4358          * is running. We always reset all schedulers for a device and all
4359          * devices in an XGMI hive, so that should take care of them too.
4360          */
4361         hive = amdgpu_get_xgmi_hive(adev);
4362         if (hive) {
4363                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4364                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4365                                 job ? job->base.id : -1, hive->hive_id);
4366                         amdgpu_put_xgmi_hive(hive);
4367                         return 0;
4368                 }
4369                 mutex_lock(&hive->hive_lock);
4370         }
4371
4372         /*
4373          * Build list of devices to reset.
4374          * If we are in XGMI hive mode, reorder the device list
4375          * so that adev is in the first position.
4376          */
4377         INIT_LIST_HEAD(&device_list);
4378         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4379                 if (!hive)
4380                         return -ENODEV;
4381                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4382                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4383                 device_list_handle = &hive->device_list;
4384         } else {
4385                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4386                 device_list_handle = &device_list;
4387         }
4388
4389         /* block all schedulers and reset given job's ring */
4390         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4391                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4392                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4393                                   job ? job->base.id : -1);
4394                         r = 0;
4395                         goto skip_recovery;
4396                 }
4397
4398                 /*
4399                  * Try to put the audio codec into suspend state
4400                  * before the gpu reset starts.
4401                  *
4402                  * The power domain of the graphics device is shared
4403                  * with the AZ (audio) power domain. Without this, we
4404                  * may change the audio hardware behind the audio
4405                  * driver's back, which can trigger audio codec
4406                  * errors.
4407                  */
4408                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4409                         audio_suspended = true;
4410
4411                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4412
4413                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4414
4415                 if (!amdgpu_sriov_vf(tmp_adev))
4416                         amdgpu_amdkfd_pre_reset(tmp_adev);
4417
4418                 /*
4419                  * Mark the ASICs to be reset as untracked first,
4420                  * and add them back after the reset completes.
4421                  */
4422                 amdgpu_unregister_gpu_instance(tmp_adev);
4423
4424                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4425
4426                 /* disable ras on ALL IPs */
4427                 if (!need_emergency_restart &&
4428                       amdgpu_device_ip_need_full_reset(tmp_adev))
4429                         amdgpu_ras_suspend(tmp_adev);
4430
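                      /* Stop every scheduler on this device; on an emergency restart also stop the jobs still queued on them */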
4431                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4432                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4433
4434                         if (!ring || !ring->sched.thread)
4435                                 continue;
4436
4437                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4438
4439                         if (need_emergency_restart)
4440                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4441                 }
4442         }
4443
4444         if (need_emergency_restart)
4445                 goto skip_sched_resume;
4446
4447         /*
4448          * Must check guilty signal here since after this point all old
4449          * HW fences are force signaled.
4450          *
4451          * job->base holds a reference to parent fence
4452          */
4453         if (job && job->base.s_fence->parent &&
4454             dma_fence_is_signaled(job->base.s_fence->parent)) {
4455                 job_signaled = true;
4456                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4457                 goto skip_hw_reset;
4458         }
4459
4460 retry:  /* Pre-ASIC reset for all adevs in the device list. */
4461         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4462                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4463                                                  NULL,
4464                                                  &need_full_reset);
4465                 /* TODO: Should we stop? */
4466                 if (r) {
4467                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4468                                   r, adev_to_drm(tmp_adev)->unique);
4469                         tmp_adev->asic_reset_res = r;
4470                 }
4471         }
4472
4473         /* Actual ASIC resets if needed. */
4474         /* TODO: Implement XGMI hive reset logic for SRIOV */
4475         if (amdgpu_sriov_vf(adev)) {
4476                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4477                 if (r)
4478                         adev->asic_reset_res = r;
4479         } else {
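                      /* Bare-metal path: full ASIC reset across the list; -EAGAIN from the post-reset IB tests retries the whole sequence */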
4480                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4481                 if (r == -EAGAIN)
4482                         goto retry;
4483         }
4484
4485 skip_hw_reset:
4486
4487         /* Post-ASIC reset for all devs. */
4488         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4489
4490                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4491                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4492
4493                         if (!ring || !ring->sched.thread)
4494                                 continue;
4495
4496                         /* No point in resubmitting jobs if we didn't do a HW reset */
4497                         if (!tmp_adev->asic_reset_res && !job_signaled)
4498                                 drm_sched_resubmit_jobs(&ring->sched);
4499
4500                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4501                 }
4502
4503                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4504                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4505                 }
4506
4507                 tmp_adev->asic_reset_res = 0;
4508
4509                 if (r) {
4510                         /* bad news, how to tell it to userspace ? */
4511                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4512                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4513                 } else {
4514                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4515                 }
4516         }
4517
4518 skip_sched_resume:
4519         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4520                 /* unlock kfd: SRIOV would do it separately */
4521                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4522                         amdgpu_amdkfd_post_reset(tmp_adev);
4523                 if (audio_suspended)
4524                         amdgpu_device_resume_display_audio(tmp_adev);
4525                 amdgpu_device_unlock_adev(tmp_adev);
4526         }
4527
4528 skip_recovery:
4529         if (hive) {
4530                 atomic_set(&hive->in_reset, 0);
4531                 mutex_unlock(&hive->hive_lock);
4532                 amdgpu_put_xgmi_hive(hive);
4533         }
4534
4535         if (r)
4536                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4537         return r;
4538 }
4539
4540 /**
4541  * amdgpu_device_get_pcie_info - fetch PCIE info about the PCIE slot
4542  *
4543  * @adev: amdgpu_device pointer
4544  *
4545  * Fetches and stores in the driver the PCIE capabilities (gen speed
4546  * and lanes) of the slot the device is in. Handles APUs and
4547  * virtualized environments where PCIE config space may not be available.
4548  */
4549 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4550 {
4551         struct pci_dev *pdev;
4552         enum pci_bus_speed speed_cap, platform_speed_cap;
4553         enum pcie_link_width platform_link_width;
4554
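              /* A non-zero amdgpu_pcie_gen_cap / amdgpu_pcie_lane_cap override takes precedence over probing */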
4555         if (amdgpu_pcie_gen_cap)
4556                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4557
4558         if (amdgpu_pcie_lane_cap)
4559                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4560
4561         /* covers APUs as well */
4562         if (pci_is_root_bus(adev->pdev->bus)) {
4563                 if (adev->pm.pcie_gen_mask == 0)
4564                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4565                 if (adev->pm.pcie_mlw_mask == 0)
4566                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4567                 return;
4568         }
4569
4570         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4571                 return;
4572
4573         pcie_bandwidth_available(adev->pdev, NULL,
4574                                  &platform_speed_cap, &platform_link_width);
4575
4576         if (adev->pm.pcie_gen_mask == 0) {
4577                 /* asic caps */
4578                 pdev = adev->pdev;
4579                 speed_cap = pcie_get_speed_cap(pdev);
4580                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4581                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4582                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4583                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4584                 } else {
4585                         if (speed_cap == PCIE_SPEED_16_0GT)
4586                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4587                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4588                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4589                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4590                         else if (speed_cap == PCIE_SPEED_8_0GT)
4591                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4592                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4593                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4594                         else if (speed_cap == PCIE_SPEED_5_0GT)
4595                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4596                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4597                         else
4598                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4599                 }
4600                 /* platform caps */
4601                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4602                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4603                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4604                 } else {
4605                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4606                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4607                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4608                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4609                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4610                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4611                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4612                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4613                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4614                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4615                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4616                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4617                         else
4618                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4619
4620                 }
4621         }
4622         if (adev->pm.pcie_mlw_mask == 0) {
4623                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4624                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4625                 } else {
4626                         switch (platform_link_width) {
4627                         case PCIE_LNK_X32:
4628                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4629                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4630                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4631                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4632                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4633                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4634                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4635                                 break;
4636                         case PCIE_LNK_X16:
4637                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4638                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4639                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4640                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4641                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4642                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4643                                 break;
4644                         case PCIE_LNK_X12:
4645                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4646                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4647                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4648                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4649                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4650                                 break;
4651                         case PCIE_LNK_X8:
4652                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4653                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4654                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4655                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4656                                 break;
4657                         case PCIE_LNK_X4:
4658                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4659                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4660                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4661                                 break;
4662                         case PCIE_LNK_X2:
4663                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4664                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4665                                 break;
4666                         case PCIE_LNK_X1:
4667                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4668                                 break;
4669                         default:
4670                                 break;
4671                         }
4672                 }
4673         }
4674 }
4675
4676 int amdgpu_device_baco_enter(struct drm_device *dev)
4677 {
4678         struct amdgpu_device *adev = drm_to_adev(dev);
4679         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4680
4681         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4682                 return -ENOTSUPP;
4683
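              /* RAS-capable boards disable doorbell interrupts across BACO; amdgpu_device_baco_exit() re-enables them */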
4684         if (ras && ras->supported)
4685                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4686
4687         return amdgpu_dpm_baco_enter(adev);
4688 }
4689
4690 int amdgpu_device_baco_exit(struct drm_device *dev)
4691 {
4692         struct amdgpu_device *adev = drm_to_adev(dev);
4693         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4694         int ret = 0;
4695
4696         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4697                 return -ENOTSUPP;
4698
4699         ret = amdgpu_dpm_baco_exit(adev);
4700         if (ret)
4701                 return ret;
4702
4703         if (ras && ras->supported)
4704                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4705
4706         return 0;
4707 }