2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
24 * Authors: Dave Airlie
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
59 #ifdef CONFIG_DRM_AMDGPU_CIK
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_pmu.h"
75 #include "amdgpu_fru_eeprom.h"
76 #include "amdgpu_reset.h"
77 #include "amdgpu_virt.h"
78 #include "amdgpu_dev_coredump.h"
80 #include <linux/suspend.h>
81 #include <drm/task_barrier.h>
82 #include <linux/pm_runtime.h>
84 #include <drm/drm_drv.h>
86 #if IS_ENABLED(CONFIG_X86)
87 #include <asm/intel-family.h>
90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
98 #define AMDGPU_RESUME_MS 2000
99 #define AMDGPU_MAX_RETRY_LIMIT 2
100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
105 static const struct drm_driver amdgpu_kms_driver;
107 const char *amdgpu_asic_name[] = {
148 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
150 * Default init level where all blocks are expected to be initialized. This is
151 * the level of initialization expected by default and also after a full reset
154 struct amdgpu_init_level amdgpu_init_default = {
155 .level = AMDGPU_INIT_LEVEL_DEFAULT,
156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
159 struct amdgpu_init_level amdgpu_init_recovery = {
160 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
165 * Minimal blocks needed to be initialized before an XGMI hive can be reset. This
166 * is used for cases like reset on initialization where the entire hive needs to
167 * be reset before first use.
169 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
170 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
171 .hwini_ip_block_mask =
172 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
173 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
174 BIT(AMD_IP_BLOCK_TYPE_PSP)
177 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
178 enum amd_ip_block_type block)
180 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
183 void amdgpu_set_init_level(struct amdgpu_device *adev,
184 enum amdgpu_init_lvl_id lvl)
187 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
188 adev->init_lvl = &amdgpu_init_minimal_xgmi;
190 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
191 adev->init_lvl = &amdgpu_init_recovery;
193 case AMDGPU_INIT_LEVEL_DEFAULT:
196 adev->init_lvl = &amdgpu_init_default;
201 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
202 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
206 * DOC: pcie_replay_count
208 * The amdgpu driver provides a sysfs API for reporting the total number
209 * of PCIe replays (NAKs).
210 * The file pcie_replay_count is used for this and returns the total
211 * number of replays as a sum of the NAKs generated and NAKs received.
214 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
215 struct device_attribute *attr, char *buf)
217 struct drm_device *ddev = dev_get_drvdata(dev);
218 struct amdgpu_device *adev = drm_to_adev(ddev);
219 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
221 return sysfs_emit(buf, "%llu\n", cnt);
224 static DEVICE_ATTR(pcie_replay_count, 0444,
225 amdgpu_device_get_pcie_replay_count, NULL);
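/*
 * Illustrative note (not part of the original file): the attribute created
 * above lives in the PCI device's sysfs directory, so on a typical system the
 * count can be read with something like
 * "cat /sys/class/drm/card0/device/pcie_replay_count"; the exact card index
 * depends on the system.
 */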
227 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
228 struct bin_attribute *attr, char *buf,
229 loff_t ppos, size_t count)
231 struct device *dev = kobj_to_dev(kobj);
232 struct drm_device *ddev = dev_get_drvdata(dev);
233 struct amdgpu_device *adev = drm_to_adev(ddev);
237 case AMDGPU_SYS_REG_STATE_XGMI:
238 bytes_read = amdgpu_asic_get_reg_state(
239 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
241 case AMDGPU_SYS_REG_STATE_WAFL:
242 bytes_read = amdgpu_asic_get_reg_state(
243 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
245 case AMDGPU_SYS_REG_STATE_PCIE:
246 bytes_read = amdgpu_asic_get_reg_state(
247 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
249 case AMDGPU_SYS_REG_STATE_USR:
250 bytes_read = amdgpu_asic_get_reg_state(
251 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
253 case AMDGPU_SYS_REG_STATE_USR_1:
254 bytes_read = amdgpu_asic_get_reg_state(
255 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
264 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
265 AMDGPU_SYS_REG_STATE_END);
267 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
271 if (!amdgpu_asic_get_reg_state_supported(adev))
274 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
279 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
281 if (!amdgpu_asic_get_reg_state_supported(adev))
283 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
286 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
290 if (ip_block->version->funcs->suspend) {
291 r = ip_block->version->funcs->suspend(ip_block);
293 dev_err(ip_block->adev->dev,
294 "suspend of IP block <%s> failed %d\n",
295 ip_block->version->funcs->name, r);
300 ip_block->status.hw = false;
304 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
308 if (ip_block->version->funcs->resume) {
309 r = ip_block->version->funcs->resume(ip_block);
311 dev_err(ip_block->adev->dev,
312 "resume of IP block <%s> failed %d\n",
313 ip_block->version->funcs->name, r);
318 ip_block->status.hw = true;
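/*
 * Illustrative sketch (not from the original file): callers typically invoke
 * these wrappers on one IP block at a time and propagate any error, e.g.
 *
 *	r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
 *	if (r)
 *		return r;
 */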
325 * The amdgpu driver provides a sysfs API for reporting board related information.
326 * It provides the form factor information in the format
330 * Possible form factor values
332 * - "cem" - PCIE CEM card
333 * - "oam" - Open Compute Accelerator Module
334 * - "unknown" - Not known
338 static ssize_t amdgpu_device_get_board_info(struct device *dev,
339 struct device_attribute *attr,
342 struct drm_device *ddev = dev_get_drvdata(dev);
343 struct amdgpu_device *adev = drm_to_adev(ddev);
344 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
347 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
348 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
351 case AMDGPU_PKG_TYPE_CEM:
354 case AMDGPU_PKG_TYPE_OAM:
362 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
365 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
367 static struct attribute *amdgpu_board_attrs[] = {
368 &dev_attr_board_info.attr,
372 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
373 struct attribute *attr, int n)
375 struct device *dev = kobj_to_dev(kobj);
376 struct drm_device *ddev = dev_get_drvdata(dev);
377 struct amdgpu_device *adev = drm_to_adev(ddev);
379 if (adev->flags & AMD_IS_APU)
385 static const struct attribute_group amdgpu_board_attrs_group = {
386 .attrs = amdgpu_board_attrs,
387 .is_visible = amdgpu_board_attrs_is_visible
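/*
 * Illustrative note (not part of the original file): with the sysfs_emit()
 * format used above, reading board_info yields a single line such as
 * "type : cem" or "type : oam", depending on the package type reported by the
 * SMUIO block.
 */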
390 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
394 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
396 * @dev: drm_device pointer
398 * Returns true if the device is a dGPU with ATPX power control,
399 * otherwise return false.
401 bool amdgpu_device_supports_px(struct drm_device *dev)
403 struct amdgpu_device *adev = drm_to_adev(dev);
405 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
411 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
413 * @dev: drm_device pointer
415 * Returns true if the device is a dGPU with ACPI power control,
416 * otherwise return false.
418 bool amdgpu_device_supports_boco(struct drm_device *dev)
420 struct amdgpu_device *adev = drm_to_adev(dev);
422 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
426 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
432 * amdgpu_device_supports_baco - Does the device support BACO
434 * @dev: drm_device pointer
437 * 1 if the device supports BACO;
438 * 3 if the device supports MACO (only works if BACO is supported)
439 * otherwise return 0.
441 int amdgpu_device_supports_baco(struct drm_device *dev)
443 struct amdgpu_device *adev = drm_to_adev(dev);
445 return amdgpu_asic_supports_baco(adev);
448 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
450 struct drm_device *dev;
453 dev = adev_to_drm(adev);
455 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
456 bamaco_support = amdgpu_device_supports_baco(dev);
458 switch (amdgpu_runtime_pm) {
460 if (bamaco_support & MACO_SUPPORT) {
461 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
462 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
463 } else if (bamaco_support == BACO_SUPPORT) {
464 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
465 dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
469 if (bamaco_support & BACO_SUPPORT) {
470 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
471 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
476 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
477 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
478 dev_info(adev->dev, "Using ATPX for runtime pm\n");
479 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
480 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
481 dev_info(adev->dev, "Using BOCO for runtime pm\n");
486 switch (adev->asic_type) {
489 /* BACO is not supported on vega20 and arcturus */
492 /* enable BACO as runpm mode if noretry=0 */
493 if (!adev->gmc.noretry)
494 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
497 /* enable BACO as runpm mode on CI+ */
498 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
502 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
503 if (bamaco_support & MACO_SUPPORT) {
504 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
505 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
507 dev_info(adev->dev, "Using BACO for runtime pm\n");
513 dev_info(adev->dev, "runtime pm is manually disabled\n");
520 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
521 dev_info(adev->dev, "Runtime PM not available\n");
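/*
 * Summary (added for clarity, not part of the original file): the logic above
 * either forces BAMACO or BACO when the amdgpu_runtime_pm module parameter
 * requests it, auto-detects a mode (preferring PX on ATPX systems, then BOCO,
 * then BACO or BAMACO depending on ASIC support), or leaves runtime PM
 * disabled when it has been manually turned off.
 */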
524 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
525 * smart shift support
527 * @dev: drm_device pointer
529 * Returns true if the device is a dGPU with Smart Shift support,
530 * otherwise returns false.
532 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
534 return (amdgpu_device_supports_boco(dev) &&
535 amdgpu_acpi_is_power_shift_control_supported());
539 * VRAM access helper functions
543 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
545 * @adev: amdgpu_device pointer
546 * @pos: offset of the buffer in vram
547 * @buf: virtual address of the buffer in system memory
548 * @size: read/write size; the buffer at @buf must be at least @size bytes
549 * @write: true - write to vram, otherwise - read from vram
551 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
552 void *buf, size_t size, bool write)
555 uint32_t hi = ~0, tmp = 0;
556 uint32_t *data = buf;
560 if (!drm_dev_enter(adev_to_drm(adev), &idx))
563 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
565 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
566 for (last = pos + size; pos < last; pos += 4) {
569 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
571 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
575 WREG32_NO_KIQ(mmMM_DATA, *data++);
577 *data++ = RREG32_NO_KIQ(mmMM_DATA);
580 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
585 * amdgpu_device_aper_access - access vram by vram aperture
587 * @adev: amdgpu_device pointer
588 * @pos: offset of the buffer in vram
589 * @buf: virtual address of the buffer in system memory
590 * @size: read/write size; the buffer at @buf must be at least @size bytes
591 * @write: true - write to vram, otherwise - read from vram
593 * Returns the number of bytes that have been transferred.
595 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
596 void *buf, size_t size, bool write)
603 if (!adev->mman.aper_base_kaddr)
606 last = min(pos + size, adev->gmc.visible_vram_size);
608 addr = adev->mman.aper_base_kaddr + pos;
612 memcpy_toio(addr, buf, count);
613 /* Make sure the HDP write cache flush happens without any reordering
614 * after the system memory contents are sent over PCIe to the device
617 amdgpu_device_flush_hdp(adev, NULL);
619 amdgpu_device_invalidate_hdp(adev, NULL);
620 /* Make sure HDP read cache is invalidated before issuing a read
624 memcpy_fromio(buf, addr, count);
636 * amdgpu_device_vram_access - read/write a buffer in vram
638 * @adev: amdgpu_device pointer
639 * @pos: offset of the buffer in vram
640 * @buf: virtual address of the buffer in system memory
641 * @size: read/write size; the buffer at @buf must be at least @size bytes
642 * @write: true - write to vram, otherwise - read from vram
644 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
645 void *buf, size_t size, bool write)
649 /* try to use the vram aperture to access vram first */
650 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
653 /* use MM_INDEX/MM_DATA to access the rest of vram */
656 amdgpu_device_mm_access(adev, pos, buf, size, write);
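/*
 * Illustrative sketch (not from the original file): reading a single dword
 * from the start of VRAM into a CPU buffer, assuming a valid adev:
 *
 *	u32 val;
 *
 *	amdgpu_device_vram_access(adev, 0, &val, sizeof(val), false);
 */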
661 * register access helper functions.
664 /* Check if hw access should be skipped because of hotplug or device error */
665 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
667 if (adev->no_hw_access)
670 #ifdef CONFIG_LOCKDEP
672 * This is a bit complicated to understand, so worth a comment. What we assert
673 * here is that the GPU reset is not running on another thread in parallel.
675 * For this we trylock the read side of the reset semaphore; if that succeeds
676 * we know that the reset is not running in parallel.
678 * If the trylock fails we assert that we are either already holding the read
679 * side of the lock or are the reset thread itself and hold the write side of
683 if (down_read_trylock(&adev->reset_domain->sem))
684 up_read(&adev->reset_domain->sem);
686 lockdep_assert_held(&adev->reset_domain->sem);
693 * amdgpu_device_rreg - read a memory mapped IO or indirect register
695 * @adev: amdgpu_device pointer
696 * @reg: dword aligned register offset
697 * @acc_flags: access flags which require special behavior
699 * Returns the 32 bit value from the offset specified.
701 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
702 uint32_t reg, uint32_t acc_flags)
706 if (amdgpu_device_skip_hw_access(adev))
709 if ((reg * 4) < adev->rmmio_size) {
710 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
711 amdgpu_sriov_runtime(adev) &&
712 down_read_trylock(&adev->reset_domain->sem)) {
713 ret = amdgpu_kiq_rreg(adev, reg, 0);
714 up_read(&adev->reset_domain->sem);
716 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
719 ret = adev->pcie_rreg(adev, reg * 4);
722 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
728 * MMIO register byte read helper function
729 * @offset: byte offset from MMIO start
733 * amdgpu_mm_rreg8 - read a memory mapped IO register
735 * @adev: amdgpu_device pointer
736 * @offset: byte aligned register offset
738 * Returns the 8 bit value from the offset specified.
740 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
742 if (amdgpu_device_skip_hw_access(adev))
745 if (offset < adev->rmmio_size)
746 return (readb(adev->rmmio + offset));
752 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
754 * @adev: amdgpu_device pointer
755 * @reg: dword aligned register offset
756 * @acc_flags: access flags which require special behavior
757 * @xcc_id: xcc accelerated compute core id
759 * Returns the 32 bit value from the offset specified.
761 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
762 uint32_t reg, uint32_t acc_flags,
765 uint32_t ret, rlcg_flag;
767 if (amdgpu_device_skip_hw_access(adev))
770 if ((reg * 4) < adev->rmmio_size) {
771 if (amdgpu_sriov_vf(adev) &&
772 !amdgpu_sriov_runtime(adev) &&
773 adev->gfx.rlc.rlcg_reg_access_supported &&
774 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
777 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
778 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
779 amdgpu_sriov_runtime(adev) &&
780 down_read_trylock(&adev->reset_domain->sem)) {
781 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
782 up_read(&adev->reset_domain->sem);
784 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
787 ret = adev->pcie_rreg(adev, reg * 4);
794 * MMIO register byte write helper function
795 * @offset: byte offset from MMIO start
796 * @value: the value to be written to the register
800 * amdgpu_mm_wreg8 - write a memory mapped IO register
802 * @adev: amdgpu_device pointer
803 * @offset: byte aligned register offset
804 * @value: 8 bit value to write
806 * Writes the value specified to the offset specified.
808 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
810 if (amdgpu_device_skip_hw_access(adev))
813 if (offset < adev->rmmio_size)
814 writeb(value, adev->rmmio + offset);
820 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
822 * @adev: amdgpu_device pointer
823 * @reg: dword aligned register offset
824 * @v: 32 bit value to write to the register
825 * @acc_flags: access flags which require special behavior
827 * Writes the value specified to the offset specified.
829 void amdgpu_device_wreg(struct amdgpu_device *adev,
830 uint32_t reg, uint32_t v,
833 if (amdgpu_device_skip_hw_access(adev))
836 if ((reg * 4) < adev->rmmio_size) {
837 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
838 amdgpu_sriov_runtime(adev) &&
839 down_read_trylock(&adev->reset_domain->sem)) {
840 amdgpu_kiq_wreg(adev, reg, v, 0);
841 up_read(&adev->reset_domain->sem);
843 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
846 adev->pcie_wreg(adev, reg * 4, v);
849 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
853 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
855 * @adev: amdgpu_device pointer
856 * @reg: mmio/rlc register
858 * @xcc_id: xcc accelerated compute core id
860 * This function is invoked only for debugfs register access.
862 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
863 uint32_t reg, uint32_t v,
866 if (amdgpu_device_skip_hw_access(adev))
869 if (amdgpu_sriov_fullaccess(adev) &&
870 adev->gfx.rlc.funcs &&
871 adev->gfx.rlc.funcs->is_rlcg_access_range) {
872 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
873 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
874 } else if ((reg * 4) >= adev->rmmio_size) {
875 adev->pcie_wreg(adev, reg * 4, v);
877 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
882 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
884 * @adev: amdgpu_device pointer
885 * @reg: dword aligned register offset
886 * @v: 32 bit value to write to the register
887 * @acc_flags: access flags which require special behavior
888 * @xcc_id: xcc accelerated compute core id
890 * Writes the value specified to the offset specified.
892 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
893 uint32_t reg, uint32_t v,
894 uint32_t acc_flags, uint32_t xcc_id)
898 if (amdgpu_device_skip_hw_access(adev))
901 if ((reg * 4) < adev->rmmio_size) {
902 if (amdgpu_sriov_vf(adev) &&
903 !amdgpu_sriov_runtime(adev) &&
904 adev->gfx.rlc.rlcg_reg_access_supported &&
905 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
908 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
909 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
910 amdgpu_sriov_runtime(adev) &&
911 down_read_trylock(&adev->reset_domain->sem)) {
912 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
913 up_read(&adev->reset_domain->sem);
915 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
918 adev->pcie_wreg(adev, reg * 4, v);
923 * amdgpu_device_indirect_rreg - read an indirect register
925 * @adev: amdgpu_device pointer
926 * @reg_addr: indirect register address to read from
928 * Returns the value of indirect register @reg_addr
930 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
933 unsigned long flags, pcie_index, pcie_data;
934 void __iomem *pcie_index_offset;
935 void __iomem *pcie_data_offset;
938 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
939 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
941 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
942 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
943 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
945 writel(reg_addr, pcie_index_offset);
946 readl(pcie_index_offset);
947 r = readl(pcie_data_offset);
948 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
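/*
 * Note (added for clarity, not part of the original file): this is the usual
 * index/data pair access. The register address is written to the PCIE index
 * register, the index register is read back so the posted write reaches the
 * device before the data access, and the value is then read from the PCIE
 * data register, all under pcie_idx_lock so the sequence is not interleaved
 * with other users of the pair.
 */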
953 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
956 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
958 void __iomem *pcie_index_offset;
959 void __iomem *pcie_index_hi_offset;
960 void __iomem *pcie_data_offset;
962 if (unlikely(!adev->nbio.funcs)) {
963 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
964 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
966 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
967 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
970 if (reg_addr >> 32) {
971 if (unlikely(!adev->nbio.funcs))
972 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
974 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
979 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
980 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
981 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
982 if (pcie_index_hi != 0)
983 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
986 writel(reg_addr, pcie_index_offset);
987 readl(pcie_index_offset);
988 if (pcie_index_hi != 0) {
989 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
990 readl(pcie_index_hi_offset);
992 r = readl(pcie_data_offset);
994 /* clear the high bits */
995 if (pcie_index_hi != 0) {
996 writel(0, pcie_index_hi_offset);
997 readl(pcie_index_hi_offset);
1000 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1006 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
1008 * @adev: amdgpu_device pointer
1009 * @reg_addr: indirect register address to read from
1011 * Returns the value of indirect register @reg_addr
1013 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1016 unsigned long flags, pcie_index, pcie_data;
1017 void __iomem *pcie_index_offset;
1018 void __iomem *pcie_data_offset;
1021 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1022 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1024 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1025 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1026 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1028 /* read low 32 bits */
1029 writel(reg_addr, pcie_index_offset);
1030 readl(pcie_index_offset);
1031 r = readl(pcie_data_offset);
1032 /* read high 32 bits */
1033 writel(reg_addr + 4, pcie_index_offset);
1034 readl(pcie_index_offset);
1035 r |= ((u64)readl(pcie_data_offset) << 32);
1036 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1041 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1044 unsigned long flags, pcie_index, pcie_data;
1045 unsigned long pcie_index_hi = 0;
1046 void __iomem *pcie_index_offset;
1047 void __iomem *pcie_index_hi_offset;
1048 void __iomem *pcie_data_offset;
1051 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1052 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1053 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1054 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1056 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1057 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1058 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1059 if (pcie_index_hi != 0)
1060 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1063 /* read low 32 bits */
1064 writel(reg_addr, pcie_index_offset);
1065 readl(pcie_index_offset);
1066 if (pcie_index_hi != 0) {
1067 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1068 readl(pcie_index_hi_offset);
1070 r = readl(pcie_data_offset);
1071 /* read high 32 bits */
1072 writel(reg_addr + 4, pcie_index_offset);
1073 readl(pcie_index_offset);
1074 if (pcie_index_hi != 0) {
1075 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1076 readl(pcie_index_hi_offset);
1078 r |= ((u64)readl(pcie_data_offset) << 32);
1080 /* clear the high bits */
1081 if (pcie_index_hi != 0) {
1082 writel(0, pcie_index_hi_offset);
1083 readl(pcie_index_hi_offset);
1086 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1092 * amdgpu_device_indirect_wreg - write an indirect register
1094 * @adev: amdgpu_device pointer
1095 * @reg_addr: indirect register offset
1096 * @reg_data: indirect register data
1099 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1100 u32 reg_addr, u32 reg_data)
1102 unsigned long flags, pcie_index, pcie_data;
1103 void __iomem *pcie_index_offset;
1104 void __iomem *pcie_data_offset;
1106 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1107 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1109 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1110 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1111 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1113 writel(reg_addr, pcie_index_offset);
1114 readl(pcie_index_offset);
1115 writel(reg_data, pcie_data_offset);
1116 readl(pcie_data_offset);
1117 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1120 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1121 u64 reg_addr, u32 reg_data)
1123 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1124 void __iomem *pcie_index_offset;
1125 void __iomem *pcie_index_hi_offset;
1126 void __iomem *pcie_data_offset;
1128 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1129 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1130 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1131 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1135 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1136 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1137 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1138 if (pcie_index_hi != 0)
1139 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1142 writel(reg_addr, pcie_index_offset);
1143 readl(pcie_index_offset);
1144 if (pcie_index_hi != 0) {
1145 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1146 readl(pcie_index_hi_offset);
1148 writel(reg_data, pcie_data_offset);
1149 readl(pcie_data_offset);
1151 /* clear the high bits */
1152 if (pcie_index_hi != 0) {
1153 writel(0, pcie_index_hi_offset);
1154 readl(pcie_index_hi_offset);
1157 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1161 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
1163 * @adev: amdgpu_device pointer
1164 * @reg_addr: indirect register offset
1165 * @reg_data: indirect register data
1168 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1169 u32 reg_addr, u64 reg_data)
1171 unsigned long flags, pcie_index, pcie_data;
1172 void __iomem *pcie_index_offset;
1173 void __iomem *pcie_data_offset;
1175 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1176 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1178 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1179 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1180 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1182 /* write low 32 bits */
1183 writel(reg_addr, pcie_index_offset);
1184 readl(pcie_index_offset);
1185 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1186 readl(pcie_data_offset);
1187 /* write high 32 bits */
1188 writel(reg_addr + 4, pcie_index_offset);
1189 readl(pcie_index_offset);
1190 writel((u32)(reg_data >> 32), pcie_data_offset);
1191 readl(pcie_data_offset);
1192 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1195 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1196 u64 reg_addr, u64 reg_data)
1198 unsigned long flags, pcie_index, pcie_data;
1199 unsigned long pcie_index_hi = 0;
1200 void __iomem *pcie_index_offset;
1201 void __iomem *pcie_index_hi_offset;
1202 void __iomem *pcie_data_offset;
1204 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1205 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1206 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1207 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1209 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1210 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1211 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1212 if (pcie_index_hi != 0)
1213 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1216 /* write low 32 bits */
1217 writel(reg_addr, pcie_index_offset);
1218 readl(pcie_index_offset);
1219 if (pcie_index_hi != 0) {
1220 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1221 readl(pcie_index_hi_offset);
1223 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1224 readl(pcie_data_offset);
1225 /* write high 32 bits */
1226 writel(reg_addr + 4, pcie_index_offset);
1227 readl(pcie_index_offset);
1228 if (pcie_index_hi != 0) {
1229 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1230 readl(pcie_index_hi_offset);
1232 writel((u32)(reg_data >> 32), pcie_data_offset);
1233 readl(pcie_data_offset);
1235 /* clear the high bits */
1236 if (pcie_index_hi != 0) {
1237 writel(0, pcie_index_hi_offset);
1238 readl(pcie_index_hi_offset);
1241 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1245 * amdgpu_device_get_rev_id - query device rev_id
1247 * @adev: amdgpu_device pointer
1249 * Return device rev_id
1251 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1253 return adev->nbio.funcs->get_rev_id(adev);
1257 * amdgpu_invalid_rreg - dummy reg read function
1259 * @adev: amdgpu_device pointer
1260 * @reg: offset of register
1262 * Dummy register read function. Used for register blocks
1263 * that certain asics don't have (all asics).
1264 * Returns the value in the register.
1266 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1268 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1273 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1275 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1281 * amdgpu_invalid_wreg - dummy reg write function
1283 * @adev: amdgpu_device pointer
1284 * @reg: offset of register
1285 * @v: value to write to the register
1287 * Dummy register write function. Used for register blocks
1288 * that certain asics don't have (all asics).
1290 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1292 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1297 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1299 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1305 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1307 * @adev: amdgpu_device pointer
1308 * @reg: offset of register
1310 * Dummy register read function. Used for register blocks
1311 * that certain asics don't have (all asics).
1312 * Returns the value in the register.
1314 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1316 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1321 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1323 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1329 * amdgpu_invalid_wreg64 - dummy reg write function
1331 * @adev: amdgpu_device pointer
1332 * @reg: offset of register
1333 * @v: value to write to the register
1335 * Dummy register write function. Used for register blocks
1336 * that certain asics don't have (all asics).
1338 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1340 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1345 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1347 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1353 * amdgpu_block_invalid_rreg - dummy reg read function
1355 * @adev: amdgpu_device pointer
1356 * @block: offset of instance
1357 * @reg: offset of register
1359 * Dummy register read function. Used for register blocks
1360 * that certain asics don't have (all asics).
1361 * Returns the value in the register.
1363 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1364 uint32_t block, uint32_t reg)
1366 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1373 * amdgpu_block_invalid_wreg - dummy reg write function
1375 * @adev: amdgpu_device pointer
1376 * @block: offset of instance
1377 * @reg: offset of register
1378 * @v: value to write to the register
1380 * Dummy register write function. Used for register blocks
1381 * that certain asics don't have (all asics).
1383 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1385 uint32_t reg, uint32_t v)
1387 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1393 * amdgpu_device_asic_init - Wrapper for atom asic_init
1395 * @adev: amdgpu_device pointer
1397 * Does any asic specific work and then calls atom asic init.
1399 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1403 amdgpu_asic_pre_asic_init(adev);
1405 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1406 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1407 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1408 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1409 amdgpu_psp_wait_for_bootloader(adev);
1410 ret = amdgpu_atomfirmware_asic_init(adev, true);
1413 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1420 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1422 * @adev: amdgpu_device pointer
1424 * Allocates a scratch page of VRAM for use by various things in the
1427 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1429 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1430 AMDGPU_GEM_DOMAIN_VRAM |
1431 AMDGPU_GEM_DOMAIN_GTT,
1432 &adev->mem_scratch.robj,
1433 &adev->mem_scratch.gpu_addr,
1434 (void **)&adev->mem_scratch.ptr);
1438 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1440 * @adev: amdgpu_device pointer
1442 * Frees the VRAM scratch page.
1444 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1446 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1450 * amdgpu_device_program_register_sequence - program an array of registers.
1452 * @adev: amdgpu_device pointer
1453 * @registers: pointer to the register array
1454 * @array_size: size of the register array
1456 * Programs an array of registers with AND/OR masks.
1457 * This is a helper for setting golden registers.
1459 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1460 const u32 *registers,
1461 const u32 array_size)
1463 u32 tmp, reg, and_mask, or_mask;
1469 for (i = 0; i < array_size; i += 3) {
1470 reg = registers[i + 0];
1471 and_mask = registers[i + 1];
1472 or_mask = registers[i + 2];
1474 if (and_mask == 0xffffffff) {
1479 if (adev->family >= AMDGPU_FAMILY_AI)
1480 tmp |= (or_mask & and_mask);
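/*
 * Worked example (added for clarity, not part of the original file): for a
 * triple { reg, and_mask = 0x0000ff00, or_mask = 0x00002a00 } the register is
 * read, the bits selected by and_mask are cleared, and on AMDGPU_FAMILY_AI
 * and newer only the or_mask bits that fall inside and_mask are set, so bits
 * 8..15 end up as 0x2a while all other bits keep their previous value.
 */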
1489 * amdgpu_device_pci_config_reset - reset the GPU
1491 * @adev: amdgpu_device pointer
1493 * Resets the GPU using the pci config reset sequence.
1494 * Only applicable to asics prior to vega10.
1496 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1498 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1502 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1504 * @adev: amdgpu_device pointer
1506 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1508 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1510 return pci_reset_function(adev->pdev);
1514 * amdgpu_device_wb_*()
1515 * Writeback is the method by which the GPU updates special pages in memory
1516 * with the status of certain GPU events (fences, ring pointers, etc.).
1520 * amdgpu_device_wb_fini - Disable Writeback and free memory
1522 * @adev: amdgpu_device pointer
1524 * Disables Writeback and frees the Writeback memory (all asics).
1525 * Used at driver shutdown.
1527 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1529 if (adev->wb.wb_obj) {
1530 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1532 (void **)&adev->wb.wb);
1533 adev->wb.wb_obj = NULL;
1538 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1540 * @adev: amdgpu_device pointer
1542 * Initializes writeback and allocates writeback memory (all asics).
1543 * Used at driver startup.
1544 * Returns 0 on success or a negative error code on failure.
1546 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1550 if (adev->wb.wb_obj == NULL) {
1551 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1552 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1553 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1554 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1555 (void **)&adev->wb.wb);
1557 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1561 adev->wb.num_wb = AMDGPU_MAX_WB;
1562 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1564 /* clear wb memory */
1565 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1572 * amdgpu_device_wb_get - Allocate a wb entry
1574 * @adev: amdgpu_device pointer
1577 * Allocate a wb slot for use by the driver (all asics).
1578 * Returns 0 on success or -EINVAL on failure.
1580 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1582 unsigned long flags, offset;
1584 spin_lock_irqsave(&adev->wb.lock, flags);
1585 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1586 if (offset < adev->wb.num_wb) {
1587 __set_bit(offset, adev->wb.used);
1588 spin_unlock_irqrestore(&adev->wb.lock, flags);
1589 *wb = offset << 3; /* convert to dw offset */
1592 spin_unlock_irqrestore(&adev->wb.lock, flags);
1598 * amdgpu_device_wb_free - Free a wb entry
1600 * @adev: amdgpu_device pointer
1603 * Free a wb slot allocated for use by the driver (all asics)
1605 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1607 unsigned long flags;
1610 spin_lock_irqsave(&adev->wb.lock, flags);
1611 if (wb < adev->wb.num_wb)
1612 __clear_bit(wb, adev->wb.used);
1613 spin_unlock_irqrestore(&adev->wb.lock, flags);
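/*
 * Illustrative sketch (not from the original file): a typical user allocates
 * a slot, derives the GPU and CPU addresses from the returned dword offset,
 * and frees the slot when done, roughly:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 wb_gpu_addr = adev->wb.gpu_addr + wb * 4;
 *		volatile u32 *wb_cpu_addr = &adev->wb.wb[wb];
 *
 *		... use the slot, then ...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */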
1617 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1619 * @adev: amdgpu_device pointer
1621 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1622 * to fail, but if any of the BARs is not accessible after the resize we abort
1623 * driver loading by returning -ENODEV.
1625 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1627 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1628 struct pci_bus *root;
1629 struct resource *res;
1634 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1638 if (amdgpu_sriov_vf(adev))
1641 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1642 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1643 DRM_WARN("System can't access extended configuration space, please check!!\n");
1645 /* skip if the bios has already enabled large BAR */
1646 if (adev->gmc.real_vram_size &&
1647 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1650 /* Check if the root BUS has 64bit memory resources */
1651 root = adev->pdev->bus;
1652 while (root->parent)
1653 root = root->parent;
1655 pci_bus_for_each_resource(root, res, i) {
1656 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1657 res->start > 0x100000000ull)
1661 /* Trying to resize is pointless without a root hub window above 4GB */
1665 /* Limit the BAR size to what is available */
1666 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1669 /* Disable memory decoding while we change the BAR addresses and size */
1670 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1671 pci_write_config_word(adev->pdev, PCI_COMMAND,
1672 cmd & ~PCI_COMMAND_MEMORY);
1674 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1675 amdgpu_doorbell_fini(adev);
1676 if (adev->asic_type >= CHIP_BONAIRE)
1677 pci_release_resource(adev->pdev, 2);
1679 pci_release_resource(adev->pdev, 0);
1681 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1683 DRM_INFO("Not enough PCI address space for a large BAR.");
1684 else if (r && r != -ENOTSUPP)
1685 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1687 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1689 /* When the doorbell or fb BAR isn't available we have no chance of
1692 r = amdgpu_doorbell_init(adev);
1693 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1696 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1701 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1703 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1710 * GPU helper functions.
1713 * amdgpu_device_need_post - check if the hw needs to be posted or not
1715 * @adev: amdgpu_device pointer
1717 * Check if the asic has been initialized (all asics) at driver startup,
1718 * or if a post is needed because a hw reset was performed.
1719 * Returns true if a post is needed or false if not.
1721 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1725 if (amdgpu_sriov_vf(adev))
1728 if (!amdgpu_device_read_bios(adev))
1731 if (amdgpu_passthrough(adev)) {
1732 * For FIJI: in the whole GPU pass-through virtualization case, after a VM reboot
1733 * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU
1734 * hangs. SMC firmware versions above 22.15 do not have this flaw, so we force a
1735 * vPost for SMC versions below 22.15.
1737 if (adev->asic_type == CHIP_FIJI) {
1741 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1742 /* force vPost if error occurred */
1746 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1747 release_firmware(adev->pm.fw);
1748 if (fw_ver < 0x00160e00)
1753 /* Don't post if we need to reset whole hive on init */
1754 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1757 if (adev->has_hw_reset) {
1758 adev->has_hw_reset = false;
1762 /* bios scratch used on CIK+ */
1763 if (adev->asic_type >= CHIP_BONAIRE)
1764 return amdgpu_atombios_scratch_need_asic_init(adev);
1766 /* check MEM_SIZE for older asics */
1767 reg = amdgpu_asic_get_config_memsize(adev);
1769 if ((reg != 0) && (reg != 0xffffffff))
1776 * Check whether seamless boot is supported.
1778 * So far we only support seamless boot on DCE 3.0 or later.
1779 * If users report that it works on older ASICS as well, we may
1782 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1784 switch (amdgpu_seamless) {
1792 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1797 if (!(adev->flags & AMD_IS_APU))
1800 if (adev->mman.keep_stolen_vga_memory)
1803 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1807 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1808 * don't support dynamic speed switching. Until we have confirmation from Intel
1809 * that a specific host supports it, it's safer that we keep it disabled for all.
1811 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1812 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1814 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1816 #if IS_ENABLED(CONFIG_X86)
1817 struct cpuinfo_x86 *c = &cpu_data(0);
1819 /* eGPU change speeds based on USB4 fabric conditions */
1820 if (dev_is_removable(adev->dev))
1823 if (c->x86_vendor == X86_VENDOR_INTEL)
1830 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1832 * @adev: amdgpu_device pointer
1834 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1835 * be set for this device.
1837 * Returns true if it should be used or false if not.
1839 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1841 switch (amdgpu_aspm) {
1851 if (adev->flags & AMD_IS_APU)
1853 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1855 return pcie_aspm_enabled(adev->pdev);
1858 /* if we get transitioned to only one device, take VGA back */
1860 * amdgpu_device_vga_set_decode - enable/disable vga decode
1862 * @pdev: PCI device pointer
1863 * @state: enable/disable vga decode
1865 * Enable/disable vga decode (all asics).
1866 * Returns VGA resource flags.
1868 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1871 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1873 amdgpu_asic_set_vga_state(adev, state);
1875 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1876 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1878 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1882 * amdgpu_device_check_block_size - validate the vm block size
1884 * @adev: amdgpu_device pointer
1886 * Validates the vm block size specified via module parameter.
1887 * The vm block size defines the number of bits in the page table versus the page
1888 * directory. A page is 4KB, so we have a 12 bit offset, a minimum of 9 bits in the
1889 * page table, and the remaining bits in the page directory.
1891 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1893 /* defines number of bits in page table versus page directory,
1894 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1895 * page table and the remaining bits are in the page directory
1897 if (amdgpu_vm_block_size == -1)
1900 if (amdgpu_vm_block_size < 9) {
1901 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1902 amdgpu_vm_block_size);
1903 amdgpu_vm_block_size = -1;
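/*
 * Worked example (added for clarity, not part of the original file): with 4KB
 * pages there is a 12 bit in-page offset, so the minimum block size of 9 means
 * one page table covers at least 2^(9 + 12) bytes = 2MB of virtual address
 * space; 9 bits also matches one full 4KB page of 8 byte PTEs (512 entries).
 */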
1908 * amdgpu_device_check_vm_size - validate the vm size
1910 * @adev: amdgpu_device pointer
1912 * Validates the vm size in GB specified via module parameter.
1913 * The VM size is the size of the GPU virtual memory space in GB.
1915 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1917 /* no need to check the default value */
1918 if (amdgpu_vm_size == -1)
1921 if (amdgpu_vm_size < 1) {
1922 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1924 amdgpu_vm_size = -1;
1928 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1931 bool is_os_64 = (sizeof(void *) == 8);
1932 uint64_t total_memory;
1933 uint64_t dram_size_seven_GB = 0x1B8000000;
1934 uint64_t dram_size_three_GB = 0xB8000000;
1936 if (amdgpu_smu_memory_pool_size == 0)
1940 DRM_WARN("Not 64-bit OS, feature not supported\n");
1944 total_memory = (uint64_t)si.totalram * si.mem_unit;
1946 if ((amdgpu_smu_memory_pool_size == 1) ||
1947 (amdgpu_smu_memory_pool_size == 2)) {
1948 if (total_memory < dram_size_three_GB)
1950 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1951 (amdgpu_smu_memory_pool_size == 8)) {
1952 if (total_memory < dram_size_seven_GB)
1955 DRM_WARN("Smu memory pool size not supported\n");
1958 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1963 DRM_WARN("Not enough system memory\n");
1965 adev->pm.smu_prv_buffer_size = 0;
1968 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1970 if (!(adev->flags & AMD_IS_APU) ||
1971 adev->asic_type < CHIP_RAVEN)
1974 switch (adev->asic_type) {
1976 if (adev->pdev->device == 0x15dd)
1977 adev->apu_flags |= AMD_APU_IS_RAVEN;
1978 if (adev->pdev->device == 0x15d8)
1979 adev->apu_flags |= AMD_APU_IS_PICASSO;
1982 if ((adev->pdev->device == 0x1636) ||
1983 (adev->pdev->device == 0x164c))
1984 adev->apu_flags |= AMD_APU_IS_RENOIR;
1986 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1989 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1991 case CHIP_YELLOW_CARP:
1993 case CHIP_CYAN_SKILLFISH:
1994 if ((adev->pdev->device == 0x13FE) ||
1995 (adev->pdev->device == 0x143F))
1996 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2006 * amdgpu_device_check_arguments - validate module params
2008 * @adev: amdgpu_device pointer
2010 * Validates certain module parameters and updates
2011 * the associated values used by the driver (all asics).
2013 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2017 if (amdgpu_sched_jobs < 4) {
2018 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2020 amdgpu_sched_jobs = 4;
2021 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2022 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2024 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2027 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2028 /* gart size must be greater or equal to 32M */
2029 dev_warn(adev->dev, "gart size (%d) too small\n",
2031 amdgpu_gart_size = -1;
2034 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2035 /* gtt size must be greater or equal to 32M */
2036 dev_warn(adev->dev, "gtt size (%d) too small\n",
2038 amdgpu_gtt_size = -1;
2041 /* valid range is between 4 and 9 inclusive */
2042 if (amdgpu_vm_fragment_size != -1 &&
2043 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2044 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2045 amdgpu_vm_fragment_size = -1;
2048 if (amdgpu_sched_hw_submission < 2) {
2049 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2050 amdgpu_sched_hw_submission);
2051 amdgpu_sched_hw_submission = 2;
2052 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2053 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2054 amdgpu_sched_hw_submission);
2055 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2058 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2059 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2060 amdgpu_reset_method = -1;
2063 amdgpu_device_check_smu_prv_buffer_size(adev);
2065 amdgpu_device_check_vm_size(adev);
2067 amdgpu_device_check_block_size(adev);
2069 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2071 for (i = 0; i < MAX_XCP; i++)
2072 adev->enforce_isolation[i] = !!enforce_isolation;
2078 * amdgpu_switcheroo_set_state - set switcheroo state
2080 * @pdev: pci dev pointer
2081 * @state: vga_switcheroo state
2083 * Callback for the switcheroo driver. Suspends or resumes
2084 * the asics before or after it is powered up using ACPI methods.
2086 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2087 enum vga_switcheroo_state state)
2089 struct drm_device *dev = pci_get_drvdata(pdev);
2092 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2095 if (state == VGA_SWITCHEROO_ON) {
2096 pr_info("switched on\n");
2097 /* don't suspend or resume card normally */
2098 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2100 pci_set_power_state(pdev, PCI_D0);
2101 amdgpu_device_load_pci_state(pdev);
2102 r = pci_enable_device(pdev);
2104 DRM_WARN("pci_enable_device failed (%d)\n", r);
2105 amdgpu_device_resume(dev, true);
2107 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2109 pr_info("switched off\n");
2110 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2111 amdgpu_device_prepare(dev);
2112 amdgpu_device_suspend(dev, true);
2113 amdgpu_device_cache_pci_state(pdev);
2114 /* Shut down the device */
2115 pci_disable_device(pdev);
2116 pci_set_power_state(pdev, PCI_D3cold);
2117 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2122 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2124 * @pdev: pci dev pointer
2126 * Callback for the switcheroo driver. Checks whether the switcheroo
2127 * state can be changed.
2128 * Returns true if the state can be changed, false if not.
2130 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2132 struct drm_device *dev = pci_get_drvdata(pdev);
2135 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2136 * locking inversion with the driver load path. And the access here is
2137 * completely racy anyway. So don't bother with locking for now.
2139 return atomic_read(&dev->open_count) == 0;
2142 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2143 .set_gpu_state = amdgpu_switcheroo_set_state,
2145 .can_switch = amdgpu_switcheroo_can_switch,
2149 * amdgpu_device_ip_set_clockgating_state - set the CG state
2151 * @dev: amdgpu_device pointer
2152 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2153 * @state: clockgating state (gate or ungate)
2155 * Sets the requested clockgating state for all instances of
2156 * the hardware IP specified.
2157 * Returns the error code from the last instance.
2159 int amdgpu_device_ip_set_clockgating_state(void *dev,
2160 enum amd_ip_block_type block_type,
2161 enum amd_clockgating_state state)
2163 struct amdgpu_device *adev = dev;
2166 for (i = 0; i < adev->num_ip_blocks; i++) {
2167 if (!adev->ip_blocks[i].status.valid)
2169 if (adev->ip_blocks[i].version->type != block_type)
2171 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2173 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2174 &adev->ip_blocks[i], state);
2176 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2177 adev->ip_blocks[i].version->funcs->name, r);
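/*
 * Illustrative sketch (not from the original file): a caller that wants to
 * gate clocks for all instances of the GFX IP would use something like
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *
 * and check the error code returned from the last instance.
 */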
2183 * amdgpu_device_ip_set_powergating_state - set the PG state
2185 * @dev: amdgpu_device pointer
2186 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2187 * @state: powergating state (gate or ungate)
2189 * Sets the requested powergating state for all instances of
2190 * the hardware IP specified.
2191 * Returns the error code from the last instance.
2193 int amdgpu_device_ip_set_powergating_state(void *dev,
2194 enum amd_ip_block_type block_type,
2195 enum amd_powergating_state state)
2197 struct amdgpu_device *adev = dev;
2200 for (i = 0; i < adev->num_ip_blocks; i++) {
2201 if (!adev->ip_blocks[i].status.valid)
2203 if (adev->ip_blocks[i].version->type != block_type)
2205 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2207 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2208 &adev->ip_blocks[i], state);
2210 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2211 adev->ip_blocks[i].version->funcs->name, r);
2217 * amdgpu_device_ip_get_clockgating_state - get the CG state
2219 * @adev: amdgpu_device pointer
2220 * @flags: clockgating feature flags
2222 * Walks the list of IPs on the device and updates the clockgating
2223 * flags for each IP.
2224 * Updates @flags with the feature flags for each hardware IP where
2225 * clockgating is enabled.
2227 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2232 for (i = 0; i < adev->num_ip_blocks; i++) {
2233 if (!adev->ip_blocks[i].status.valid)
2235 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2236 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
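/*
 * Illustrative usage: collect the currently enabled clockgating feature
 * flags, e.g. for a debugfs dump. The u64 flags type and AMD_CG_SUPPORT_*
 * mask shown here are assumptions about the caller's context.
 *
 *	u64 flags = 0;
 *
 *	amdgpu_device_ip_get_clockgating_state(adev, &flags);
 *	if (flags & AMD_CG_SUPPORT_GFX_MGCG)
 *		...GFX medium-grain clockgating is active...
 */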
2241 * amdgpu_device_ip_wait_for_idle - wait for idle
2243 * @adev: amdgpu_device pointer
2244 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2246 * Waits for the requested hardware IP to be idle.
2247 * Returns 0 for success or a negative error code on failure.
2249 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2250 enum amd_ip_block_type block_type)
2254 for (i = 0; i < adev->num_ip_blocks; i++) {
2255 if (!adev->ip_blocks[i].status.valid)
2257 if (adev->ip_blocks[i].version->type == block_type) {
2258 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2259 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2260 &adev->ip_blocks[i]);
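/*
 * Illustrative usage: block until the GMC block reports idle before touching
 * state that requires it (assumes a valid `adev`).
 *
 *	r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
 *	if (r)
 *		return r;
 */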
2272 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2274 * @adev: amdgpu_device pointer
2275 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2277 * Check if the hardware IP is enabled or not.
2278 * Returns true if the IP is enabled, false if not.
2280 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2281 enum amd_ip_block_type block_type)
2285 for (i = 0; i < adev->num_ip_blocks; i++) {
2286 if (adev->ip_blocks[i].version->type == block_type)
2287 return adev->ip_blocks[i].status.valid;
2294 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2296 * @adev: amdgpu_device pointer
2297 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2299 * Returns a pointer to the hardware IP block structure
2300 * if it exists for the asic, otherwise NULL.
2302 struct amdgpu_ip_block *
2303 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2304 enum amd_ip_block_type type)
2308 for (i = 0; i < adev->num_ip_blocks; i++)
2309 if (adev->ip_blocks[i].version->type == type)
2310 return &adev->ip_blocks[i];
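/*
 * Illustrative usage: look up the GFX block and read its version, with a
 * NULL check since the block may not exist on every asic.
 *
 *	struct amdgpu_ip_block *gfx_block =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (gfx_block)
 *		DRM_INFO("GFX IP v%d.%d\n", gfx_block->version->major,
 *			 gfx_block->version->minor);
 */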
2316 * amdgpu_device_ip_block_version_cmp
2318 * @adev: amdgpu_device pointer
2319 * @type: enum amd_ip_block_type
2320 * @major: major version
2321 * @minor: minor version
2323 * return 0 if equal or greater
2324 * return 1 if smaller or the ip_block doesn't exist
2326 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2327 enum amd_ip_block_type type,
2328 u32 major, u32 minor)
2330 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2332 if (ip_block && ((ip_block->version->major > major) ||
2333 ((ip_block->version->major == major) &&
2334 (ip_block->version->minor >= minor))))
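/*
 * Illustrative usage: branch on whether the asic carries at least GFX 8.1.
 * Note the inverted return convention (0 means "equal or greater").
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       8, 1) == 0)
 *		...GFX IP is 8.1 or newer...
 */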
2341 * amdgpu_device_ip_block_add
2343 * @adev: amdgpu_device pointer
2344 * @ip_block_version: pointer to the IP to add
2346 * Adds the IP block driver information to the collection of IPs on the asic.
2349 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2350 const struct amdgpu_ip_block_version *ip_block_version)
2352 if (!ip_block_version)
2355 switch (ip_block_version->type) {
2356 case AMD_IP_BLOCK_TYPE_VCN:
2357 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2360 case AMD_IP_BLOCK_TYPE_JPEG:
2361 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2368 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2369 ip_block_version->funcs->name);
2371 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2373 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
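/*
 * Illustrative usage from an asic setup path; gmc_v6_0_ip_block is just an
 * example symbol standing in for whichever IP block version the asic code
 * registers.
 *
 *	r = amdgpu_device_ip_block_add(adev, &gmc_v6_0_ip_block);
 *	if (r)
 *		return r;
 */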
2379 * amdgpu_device_enable_virtual_display - enable virtual display feature
2381 * @adev: amdgpu_device pointer
2383 * Enables the virtual display feature if the user has enabled it via
2384 * the module parameter virtual_display. This feature provides a virtual
2385 * display hardware on headless boards or in virtualized environments.
2386 * This function parses and validates the configuration string specified by
2387 * the user and applies the virtual display configuration (number of
2388 * virtual connectors, crtcs, etc.) specified.
2390 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2392 adev->enable_virtual_display = false;
2394 if (amdgpu_virtual_display) {
2395 const char *pci_address_name = pci_name(adev->pdev);
2396 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2398 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2399 pciaddstr_tmp = pciaddstr;
2400 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2401 pciaddname = strsep(&pciaddname_tmp, ",");
2402 if (!strcmp("all", pciaddname)
2403 || !strcmp(pci_address_name, pciaddname)) {
2407 adev->enable_virtual_display = true;
2410 res = kstrtol(pciaddname_tmp, 10,
2418 adev->mode_info.num_crtc = num_crtc;
2420 adev->mode_info.num_crtc = 1;
2426 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2427 amdgpu_virtual_display, pci_address_name,
2428 adev->enable_virtual_display, adev->mode_info.num_crtc);
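/*
 * The parsing above expects the module parameter as one or more
 * "<pci address or all>,<number of crtcs>" entries separated by ';'.
 * Illustrative (assumed) examples:
 *
 *	amdgpu.virtual_display=0000:01:00.0,2
 *	amdgpu.virtual_display=all,1
 */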
2434 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2436 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2437 adev->mode_info.num_crtc = 1;
2438 adev->enable_virtual_display = true;
2439 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2440 adev->enable_virtual_display, adev->mode_info.num_crtc);
2445 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2447 * @adev: amdgpu_device pointer
2449 * Parses the asic configuration parameters specified in the gpu info
2450 * firmware and makes them available to the driver for use in configuring the asic.
2452 * Returns 0 on success, -EINVAL on failure.
2454 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2456 const char *chip_name;
2458 const struct gpu_info_firmware_header_v1_0 *hdr;
2460 adev->firmware.gpu_info_fw = NULL;
2462 if (adev->mman.discovery_bin)
2465 switch (adev->asic_type) {
2469 chip_name = "vega10";
2472 chip_name = "vega12";
2475 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2476 chip_name = "raven2";
2477 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2478 chip_name = "picasso";
2480 chip_name = "raven";
2483 chip_name = "arcturus";
2486 chip_name = "navi12";
2490 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2491 AMDGPU_UCODE_OPTIONAL,
2492 "amdgpu/%s_gpu_info.bin", chip_name);
2495 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2500 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2501 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2503 switch (hdr->version_major) {
2506 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2507 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2508 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2511 * Should be dropped when DAL no longer needs it.
2513 if (adev->asic_type == CHIP_NAVI12)
2514 goto parse_soc_bounding_box;
2516 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2517 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2518 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2519 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2520 adev->gfx.config.max_texture_channel_caches =
2521 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2522 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2523 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2524 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2525 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2526 adev->gfx.config.double_offchip_lds_buf =
2527 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2528 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2529 adev->gfx.cu_info.max_waves_per_simd =
2530 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2531 adev->gfx.cu_info.max_scratch_slots_per_cu =
2532 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2533 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2534 if (hdr->version_minor >= 1) {
2535 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2536 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2537 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2538 adev->gfx.config.num_sc_per_sh =
2539 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2540 adev->gfx.config.num_packer_per_sc =
2541 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2544 parse_soc_bounding_box:
2546 * soc bounding box info is not integrated in the discovery table, so
2547 * we always need to parse it from the gpu info firmware when needed.
2549 if (hdr->version_minor == 2) {
2550 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2551 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2552 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2553 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2559 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2568 * amdgpu_device_ip_early_init - run early init for hardware IPs
2570 * @adev: amdgpu_device pointer
2572 * Early initialization pass for hardware IPs. The hardware IPs that make
2573 * up each asic are discovered and each IP's early_init callback is run. This
2574 * is the first stage in initializing the asic.
2575 * Returns 0 on success, negative error code on failure.
2577 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2579 struct amdgpu_ip_block *ip_block;
2580 struct pci_dev *parent;
2584 amdgpu_device_enable_virtual_display(adev);
2586 if (amdgpu_sriov_vf(adev)) {
2587 r = amdgpu_virt_request_full_gpu(adev, true);
2592 switch (adev->asic_type) {
2593 #ifdef CONFIG_DRM_AMDGPU_SI
2599 adev->family = AMDGPU_FAMILY_SI;
2600 r = si_set_ip_blocks(adev);
2605 #ifdef CONFIG_DRM_AMDGPU_CIK
2611 if (adev->flags & AMD_IS_APU)
2612 adev->family = AMDGPU_FAMILY_KV;
2614 adev->family = AMDGPU_FAMILY_CI;
2616 r = cik_set_ip_blocks(adev);
2624 case CHIP_POLARIS10:
2625 case CHIP_POLARIS11:
2626 case CHIP_POLARIS12:
2630 if (adev->flags & AMD_IS_APU)
2631 adev->family = AMDGPU_FAMILY_CZ;
2633 adev->family = AMDGPU_FAMILY_VI;
2635 r = vi_set_ip_blocks(adev);
2640 r = amdgpu_discovery_set_ip_blocks(adev);
2646 if (amdgpu_has_atpx() &&
2647 (amdgpu_is_atpx_hybrid() ||
2648 amdgpu_has_atpx_dgpu_power_cntl()) &&
2649 ((adev->flags & AMD_IS_APU) == 0) &&
2650 !dev_is_removable(&adev->pdev->dev))
2651 adev->flags |= AMD_IS_PX;
2653 if (!(adev->flags & AMD_IS_APU)) {
2654 parent = pcie_find_root_port(adev->pdev);
2655 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2659 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2660 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2661 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2662 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2663 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2664 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2665 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2668 for (i = 0; i < adev->num_ip_blocks; i++) {
2669 ip_block = &adev->ip_blocks[i];
2671 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2672 DRM_WARN("disabled ip block: %d <%s>\n",
2673 i, adev->ip_blocks[i].version->funcs->name);
2674 adev->ip_blocks[i].status.valid = false;
2675 } else if (ip_block->version->funcs->early_init) {
2676 r = ip_block->version->funcs->early_init(ip_block);
2678 adev->ip_blocks[i].status.valid = false;
2680 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2681 adev->ip_blocks[i].version->funcs->name, r);
2684 adev->ip_blocks[i].status.valid = true;
2687 adev->ip_blocks[i].status.valid = true;
2689 /* get the vbios after the asic_funcs are set up */
2690 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2691 r = amdgpu_device_parse_gpu_info_fw(adev);
2696 if (amdgpu_device_read_bios(adev)) {
2697 if (!amdgpu_get_bios(adev))
2700 r = amdgpu_atombios_init(adev);
2702 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2703 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2708 /* get pf2vf msg info at its earliest time */
2709 if (amdgpu_sriov_vf(adev))
2710 amdgpu_virt_init_data_exchange(adev);
2717 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2718 if (ip_block->status.valid != false)
2719 amdgpu_amdkfd_device_probe(adev);
2721 adev->cg_flags &= amdgpu_cg_mask;
2722 adev->pg_flags &= amdgpu_pg_mask;
2727 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2731 for (i = 0; i < adev->num_ip_blocks; i++) {
2732 if (!adev->ip_blocks[i].status.sw)
2734 if (adev->ip_blocks[i].status.hw)
2736 if (!amdgpu_ip_member_of_hwini(
2737 adev, adev->ip_blocks[i].version->type))
2739 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2740 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2742 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2744 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2745 adev->ip_blocks[i].version->funcs->name, r);
2748 adev->ip_blocks[i].status.hw = true;
2755 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2759 for (i = 0; i < adev->num_ip_blocks; i++) {
2760 if (!adev->ip_blocks[i].status.sw)
2762 if (adev->ip_blocks[i].status.hw)
2764 if (!amdgpu_ip_member_of_hwini(
2765 adev, adev->ip_blocks[i].version->type))
2767 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2769 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2770 adev->ip_blocks[i].version->funcs->name, r);
2773 adev->ip_blocks[i].status.hw = true;
2779 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2783 uint32_t smu_version;
2785 if (adev->asic_type >= CHIP_VEGA10) {
2786 for (i = 0; i < adev->num_ip_blocks; i++) {
2787 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2790 if (!amdgpu_ip_member_of_hwini(adev,
2791 AMD_IP_BLOCK_TYPE_PSP))
2794 if (!adev->ip_blocks[i].status.sw)
2797 /* no need to do the fw loading again if already done*/
2798 if (adev->ip_blocks[i].status.hw == true)
2801 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2802 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2806 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2808 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2809 adev->ip_blocks[i].version->funcs->name, r);
2812 adev->ip_blocks[i].status.hw = true;
2818 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2819 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2824 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2829 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2830 struct amdgpu_ring *ring = adev->rings[i];
2832 /* No need to setup the GPU scheduler for rings that don't need it */
2833 if (!ring || ring->no_scheduler)
2836 switch (ring->funcs->type) {
2837 case AMDGPU_RING_TYPE_GFX:
2838 timeout = adev->gfx_timeout;
2840 case AMDGPU_RING_TYPE_COMPUTE:
2841 timeout = adev->compute_timeout;
2843 case AMDGPU_RING_TYPE_SDMA:
2844 timeout = adev->sdma_timeout;
2847 timeout = adev->video_timeout;
2851 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2852 DRM_SCHED_PRIORITY_COUNT,
2853 ring->num_hw_submission, 0,
2854 timeout, adev->reset_domain->wq,
2855 ring->sched_score, ring->name,
2858 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2862 r = amdgpu_uvd_entity_init(adev, ring);
2864 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2868 r = amdgpu_vce_entity_init(adev, ring);
2870 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2876 amdgpu_xcp_update_partition_sched_list(adev);
2883 * amdgpu_device_ip_init - run init for hardware IPs
2885 * @adev: amdgpu_device pointer
2887 * Main initialization pass for hardware IPs. The list of all the hardware
2888 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2889 * are run. sw_init initializes the software state associated with each IP
2890 * and hw_init initializes the hardware associated with each IP.
2891 * Returns 0 on success, negative error code on failure.
2893 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2898 r = amdgpu_ras_init(adev);
2902 for (i = 0; i < adev->num_ip_blocks; i++) {
2903 if (!adev->ip_blocks[i].status.valid)
2905 if (adev->ip_blocks[i].version->funcs->sw_init) {
2906 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
2908 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2909 adev->ip_blocks[i].version->funcs->name, r);
2913 adev->ip_blocks[i].status.sw = true;
2915 if (!amdgpu_ip_member_of_hwini(
2916 adev, adev->ip_blocks[i].version->type))
2919 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2920 /* need to do common hw init early so everything is set up for gmc */
2921 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2923 DRM_ERROR("hw_init %d failed %d\n", i, r);
2926 adev->ip_blocks[i].status.hw = true;
2927 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2928 /* need to do gmc hw init early so we can allocate gpu mem */
2929 /* Try to reserve bad pages early */
2930 if (amdgpu_sriov_vf(adev))
2931 amdgpu_virt_exchange_data(adev);
2933 r = amdgpu_device_mem_scratch_init(adev);
2935 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2938 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2940 DRM_ERROR("hw_init %d failed %d\n", i, r);
2943 r = amdgpu_device_wb_init(adev);
2945 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2948 adev->ip_blocks[i].status.hw = true;
2950 /* right after GMC hw init, we create CSA */
2951 if (adev->gfx.mcbp) {
2952 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2953 AMDGPU_GEM_DOMAIN_VRAM |
2954 AMDGPU_GEM_DOMAIN_GTT,
2957 DRM_ERROR("allocate CSA failed %d\n", r);
2962 r = amdgpu_seq64_init(adev);
2964 DRM_ERROR("allocate seq64 failed %d\n", r);
2970 if (amdgpu_sriov_vf(adev))
2971 amdgpu_virt_init_data_exchange(adev);
2973 r = amdgpu_ib_pool_init(adev);
2975 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2976 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2980 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2984 r = amdgpu_device_ip_hw_init_phase1(adev);
2988 r = amdgpu_device_fw_loading(adev);
2992 r = amdgpu_device_ip_hw_init_phase2(adev);
2997 * Retired pages will be loaded from eeprom and reserved here;
2998 * this should be called after amdgpu_device_ip_hw_init_phase2, since
2999 * for some ASICs the RAS EEPROM code relies on the SMU being fully
3000 * functional for I2C communication, which is only true at this point.
3002 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
3003 * about failures caused by a bad gpu state and stops the amdgpu init
3004 * process accordingly. For other failures, it still releases all
3005 * the resources and prints an error message, rather than returning a
3006 * negative value to the upper level.
3008 * Note: theoretically, this should be called before all vram allocations
3009 * to protect retired pages from being reused.
3011 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3012 r = amdgpu_ras_recovery_init(adev, init_badpage);
3017 * In the case of XGMI, grab an extra reference on the reset domain for this device
3019 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3020 if (amdgpu_xgmi_add_device(adev) == 0) {
3021 if (!amdgpu_sriov_vf(adev)) {
3022 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3024 if (WARN_ON(!hive)) {
3029 if (!hive->reset_domain ||
3030 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3032 amdgpu_put_xgmi_hive(hive);
3036 /* Drop the early temporary reset domain we created for device */
3037 amdgpu_reset_put_reset_domain(adev->reset_domain);
3038 adev->reset_domain = hive->reset_domain;
3039 amdgpu_put_xgmi_hive(hive);
3044 r = amdgpu_device_init_schedulers(adev);
3048 if (adev->mman.buffer_funcs_ring->sched.ready)
3049 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3051 /* Don't init kfd if whole hive need to be reset during init */
3052 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3053 kgd2kfd_init_zone_device(adev);
3054 amdgpu_amdkfd_device_init(adev);
3057 amdgpu_fru_get_product_info(adev);
3065 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3067 * @adev: amdgpu_device pointer
3069 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3070 * this function before a GPU reset. If the value is retained after a
3071 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3073 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3075 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3079 * amdgpu_device_check_vram_lost - check if vram is valid
3081 * @adev: amdgpu_device pointer
3083 * Checks the reset magic value written to the gart pointer in VRAM.
3084 * The driver calls this after a GPU reset to see if the contents of
3085 * VRAM is lost or not.
3086 * returns true if vram is lost, false if not.
3088 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3090 if (memcmp(adev->gart.ptr, adev->reset_magic,
3091 AMDGPU_RESET_MAGIC_NUM))
3094 if (!amdgpu_in_reset(adev))
3098 * For all ASICs with baco/mode1 reset, the VRAM is
3099 * always assumed to be lost.
3101 switch (amdgpu_asic_reset_method(adev)) {
3102 case AMD_RESET_METHOD_BACO:
3103 case AMD_RESET_METHOD_MODE1:
3111 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3113 * @adev: amdgpu_device pointer
3114 * @state: clockgating state (gate or ungate)
3116 * The list of all the hardware IPs that make up the asic is walked and the
3117 * set_clockgating_state callbacks are run.
3118 * Late initialization pass enabling clockgating for hardware IPs.
3119 * Fini or suspend, pass disabling clockgating for hardware IPs.
3120 * Returns 0 on success, negative error code on failure.
3123 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3124 enum amd_clockgating_state state)
3128 if (amdgpu_emu_mode == 1)
3131 for (j = 0; j < adev->num_ip_blocks; j++) {
3132 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3133 if (!adev->ip_blocks[i].status.late_initialized)
3135 /* skip CG for GFX, SDMA on S0ix */
3136 if (adev->in_s0ix &&
3137 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3138 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3140 /* skip CG for VCE/UVD, it's handled specially */
3141 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3142 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3143 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3144 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3145 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3146 /* enable clockgating to save power */
3147 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
3150 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3151 adev->ip_blocks[i].version->funcs->name, r);
3160 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3161 enum amd_powergating_state state)
3165 if (amdgpu_emu_mode == 1)
3168 for (j = 0; j < adev->num_ip_blocks; j++) {
3169 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3170 if (!adev->ip_blocks[i].status.late_initialized)
3172 /* skip PG for GFX, SDMA on S0ix */
3173 if (adev->in_s0ix &&
3174 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3177 /* skip PG for VCE/UVD, it's handled specially */
3178 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3179 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3180 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3181 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3182 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3183 /* enable powergating to save power */
3184 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3187 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3188 adev->ip_blocks[i].version->funcs->name, r);
3196 static int amdgpu_device_enable_mgpu_fan_boost(void)
3198 struct amdgpu_gpu_instance *gpu_ins;
3199 struct amdgpu_device *adev;
3202 mutex_lock(&mgpu_info.mutex);
3205 * MGPU fan boost feature should be enabled
3206 * only when there are two or more dGPUs in the system.
3209 if (mgpu_info.num_dgpu < 2)
3212 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3213 gpu_ins = &(mgpu_info.gpu_ins[i]);
3214 adev = gpu_ins->adev;
3215 if (!(adev->flags & AMD_IS_APU) &&
3216 !gpu_ins->mgpu_fan_enabled) {
3217 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3221 gpu_ins->mgpu_fan_enabled = 1;
3226 mutex_unlock(&mgpu_info.mutex);
3232 * amdgpu_device_ip_late_init - run late init for hardware IPs
3234 * @adev: amdgpu_device pointer
3236 * Late initialization pass for hardware IPs. The list of all the hardware
3237 * IPs that make up the asic is walked and the late_init callbacks are run.
3238 * late_init covers any special initialization that an IP requires
3239 * after all of them have been initialized or something that needs to happen
3240 * late in the init process.
3241 * Returns 0 on success, negative error code on failure.
3243 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3245 struct amdgpu_gpu_instance *gpu_instance;
3248 for (i = 0; i < adev->num_ip_blocks; i++) {
3249 if (!adev->ip_blocks[i].status.hw)
3251 if (adev->ip_blocks[i].version->funcs->late_init) {
3252 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3254 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3255 adev->ip_blocks[i].version->funcs->name, r);
3259 adev->ip_blocks[i].status.late_initialized = true;
3262 r = amdgpu_ras_late_init(adev);
3264 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3268 if (!amdgpu_reset_in_recovery(adev))
3269 amdgpu_ras_set_error_query_ready(adev, true);
3271 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3272 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3274 amdgpu_device_fill_reset_magic(adev);
3276 r = amdgpu_device_enable_mgpu_fan_boost();
3278 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3280 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3281 if (amdgpu_passthrough(adev) &&
3282 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3283 adev->asic_type == CHIP_ALDEBARAN))
3284 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3286 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3287 mutex_lock(&mgpu_info.mutex);
3290 * Reset device p-state to low as this was booted with high.
3292 * This should be performed only after all devices from the same
3293 * hive get initialized.
3295 * However, the number of devices in the hive is not known in advance,
3296 * as it is counted one by one during device initialization.
3298 * So we wait until all XGMI interlinked devices are initialized.
3299 * This may bring some delays as those devices may come from
3300 * different hives. But that should be OK.
3302 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3303 for (i = 0; i < mgpu_info.num_gpu; i++) {
3304 gpu_instance = &(mgpu_info.gpu_ins[i]);
3305 if (gpu_instance->adev->flags & AMD_IS_APU)
3308 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3309 AMDGPU_XGMI_PSTATE_MIN);
3311 DRM_ERROR("pstate setting failed (%d).\n", r);
3317 mutex_unlock(&mgpu_info.mutex);
3323 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3327 if (!ip_block->version->funcs->hw_fini) {
3328 DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3329 ip_block->version->funcs->name);
3331 r = ip_block->version->funcs->hw_fini(ip_block);
3332 /* XXX handle errors */
3334 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3335 ip_block->version->funcs->name, r);
3339 ip_block->status.hw = false;
3343 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3345 * @adev: amdgpu_device pointer
3347 * For ASICs that need to disable SMC first
3349 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3353 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3356 for (i = 0; i < adev->num_ip_blocks; i++) {
3357 if (!adev->ip_blocks[i].status.hw)
3359 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3360 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3366 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3370 for (i = 0; i < adev->num_ip_blocks; i++) {
3371 if (!adev->ip_blocks[i].version->funcs->early_fini)
3374 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3376 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3377 adev->ip_blocks[i].version->funcs->name, r);
3381 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3382 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3384 amdgpu_amdkfd_suspend(adev, false);
3386 /* Workaround for ASICs need to disable SMC first */
3387 amdgpu_device_smu_fini_early(adev);
3389 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3390 if (!adev->ip_blocks[i].status.hw)
3393 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3396 if (amdgpu_sriov_vf(adev)) {
3397 if (amdgpu_virt_release_full_gpu(adev, false))
3398 DRM_ERROR("failed to release exclusive mode on fini\n");
3405 * amdgpu_device_ip_fini - run fini for hardware IPs
3407 * @adev: amdgpu_device pointer
3409 * Main teardown pass for hardware IPs. The list of all the hardware
3410 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3411 * are run. hw_fini tears down the hardware associated with each IP
3412 * and sw_fini tears down any software state associated with each IP.
3413 * Returns 0 on success, negative error code on failure.
3415 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3419 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3420 amdgpu_virt_release_ras_err_handler_data(adev);
3422 if (adev->gmc.xgmi.num_physical_nodes > 1)
3423 amdgpu_xgmi_remove_device(adev);
3425 amdgpu_amdkfd_device_fini_sw(adev);
3427 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3428 if (!adev->ip_blocks[i].status.sw)
3431 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3432 amdgpu_ucode_free_bo(adev);
3433 amdgpu_free_static_csa(&adev->virt.csa_obj);
3434 amdgpu_device_wb_fini(adev);
3435 amdgpu_device_mem_scratch_fini(adev);
3436 amdgpu_ib_pool_fini(adev);
3437 amdgpu_seq64_fini(adev);
3439 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3440 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3441 /* XXX handle errors */
3443 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3444 adev->ip_blocks[i].version->funcs->name, r);
3447 adev->ip_blocks[i].status.sw = false;
3448 adev->ip_blocks[i].status.valid = false;
3451 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3452 if (!adev->ip_blocks[i].status.late_initialized)
3454 if (adev->ip_blocks[i].version->funcs->late_fini)
3455 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3456 adev->ip_blocks[i].status.late_initialized = false;
3459 amdgpu_ras_fini(adev);
3465 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3467 * @work: work_struct.
3469 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3471 struct amdgpu_device *adev =
3472 container_of(work, struct amdgpu_device, delayed_init_work.work);
3475 r = amdgpu_ib_ring_tests(adev);
3477 DRM_ERROR("ib ring test failed (%d).\n", r);
3480 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3482 struct amdgpu_device *adev =
3483 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3485 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3486 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3488 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3489 adev->gfx.gfx_off_state = true;
3493 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3495 * @adev: amdgpu_device pointer
3497 * Main suspend function for hardware IPs. The list of all the hardware
3498 * IPs that make up the asic is walked, clockgating is disabled and the
3499 * suspend callbacks are run. suspend puts the hardware and software state
3500 * in each IP into a state suitable for suspend.
3501 * Returns 0 on success, negative error code on failure.
3503 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3507 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3508 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3511 * Per PMFW team's suggestion, driver needs to handle gfxoff
3512 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
3513 * scenario. Add the missing df cstate disablement here.
3515 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3516 dev_warn(adev->dev, "Failed to disallow df cstate");
3518 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3519 if (!adev->ip_blocks[i].status.valid)
3522 /* displays are handled separately */
3523 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3526 /* XXX handle errors */
3527 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3536 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3538 * @adev: amdgpu_device pointer
3540 * Main suspend function for hardware IPs. The list of all the hardware
3541 * IPs that make up the asic is walked, clockgating is disabled and the
3542 * suspend callbacks are run. suspend puts the hardware and software state
3543 * in each IP into a state suitable for suspend.
3544 * Returns 0 on success, negative error code on failure.
3546 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3551 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3553 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3554 if (!adev->ip_blocks[i].status.valid)
3556 /* displays are handled in phase1 */
3557 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3559 /* PSP lost connection when err_event_athub occurs */
3560 if (amdgpu_ras_intr_triggered() &&
3561 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3562 adev->ip_blocks[i].status.hw = false;
3566 /* skip unnecessary suspend if we have not initialized them yet */
3567 if (!amdgpu_ip_member_of_hwini(
3568 adev, adev->ip_blocks[i].version->type))
3571 /* skip suspend of gfx/mes and psp for S0ix
3572 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3573 * like at runtime. PSP is also part of the always on hardware
3574 * so no need to suspend it.
3576 if (adev->in_s0ix &&
3577 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3578 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3579 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3582 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3583 if (adev->in_s0ix &&
3584 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3585 IP_VERSION(5, 0, 0)) &&
3586 (adev->ip_blocks[i].version->type ==
3587 AMD_IP_BLOCK_TYPE_SDMA))
3590 /* The PSP provides the IMU and RLC FW binaries to TOS once, during cold-boot.
3591 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3592 * from this location and RLC Autoload automatically also gets loaded
3593 * from here based on PMFW -> PSP message during re-init sequence.
3594 * Therefore, the psp suspend & resume should be skipped to avoid destroy
3595 * the TMR and reload FWs again for IMU enabled APU ASICs.
3597 if (amdgpu_in_reset(adev) &&
3598 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3599 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3602 /* XXX handle errors */
3603 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3604 adev->ip_blocks[i].status.hw = false;
3606 /* handle putting the SMC in the appropriate state */
3607 if (!amdgpu_sriov_vf(adev)) {
3608 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3609 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3611 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3612 adev->mp1_state, r);
3623 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3625 * @adev: amdgpu_device pointer
3627 * Main suspend function for hardware IPs. The list of all the hardware
3628 * IPs that make up the asic is walked, clockgating is disabled and the
3629 * suspend callbacks are run. suspend puts the hardware and software state
3630 * in each IP into a state suitable for suspend.
3631 * Returns 0 on success, negative error code on failure.
3633 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3637 if (amdgpu_sriov_vf(adev)) {
3638 amdgpu_virt_fini_data_exchange(adev);
3639 amdgpu_virt_request_full_gpu(adev, false);
3642 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3644 r = amdgpu_device_ip_suspend_phase1(adev);
3647 r = amdgpu_device_ip_suspend_phase2(adev);
3649 if (amdgpu_sriov_vf(adev))
3650 amdgpu_virt_release_full_gpu(adev, false);
3655 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3659 static enum amd_ip_block_type ip_order[] = {
3660 AMD_IP_BLOCK_TYPE_COMMON,
3661 AMD_IP_BLOCK_TYPE_GMC,
3662 AMD_IP_BLOCK_TYPE_PSP,
3663 AMD_IP_BLOCK_TYPE_IH,
3666 for (i = 0; i < adev->num_ip_blocks; i++) {
3668 struct amdgpu_ip_block *block;
3670 block = &adev->ip_blocks[i];
3671 block->status.hw = false;
3673 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3675 if (block->version->type != ip_order[j] ||
3676 !block->status.valid)
3679 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3681 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3682 block->version->funcs->name);
3685 block->status.hw = true;
3692 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3694 struct amdgpu_ip_block *block;
3697 static enum amd_ip_block_type ip_order[] = {
3698 AMD_IP_BLOCK_TYPE_SMC,
3699 AMD_IP_BLOCK_TYPE_DCE,
3700 AMD_IP_BLOCK_TYPE_GFX,
3701 AMD_IP_BLOCK_TYPE_SDMA,
3702 AMD_IP_BLOCK_TYPE_MES,
3703 AMD_IP_BLOCK_TYPE_UVD,
3704 AMD_IP_BLOCK_TYPE_VCE,
3705 AMD_IP_BLOCK_TYPE_VCN,
3706 AMD_IP_BLOCK_TYPE_JPEG
3709 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3710 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3715 if (block->status.valid && !block->status.hw) {
3716 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3717 r = amdgpu_ip_block_resume(block);
3719 r = block->version->funcs->hw_init(block);
3723 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3724 block->version->funcs->name);
3727 block->status.hw = true;
3735 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3737 * @adev: amdgpu_device pointer
3739 * First resume function for hardware IPs. The list of all the hardware
3740 * IPs that make up the asic is walked and the resume callbacks are run for
3741 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3742 * after a suspend and updates the software state as necessary. This
3743 * function is also used for restoring the GPU after a GPU reset.
3744 * Returns 0 on success, negative error code on failure.
3746 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3750 for (i = 0; i < adev->num_ip_blocks; i++) {
3751 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3753 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3754 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3755 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3756 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3758 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3768 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3770 * @adev: amdgpu_device pointer
3772 * Second resume function for hardware IPs. The list of all the hardware
3773 * IPs that make up the asic is walked and the resume callbacks are run for
3774 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3775 * functional state after a suspend and updates the software state as
3776 * necessary. This function is also used for restoring the GPU after a GPU
3778 * Returns 0 on success, negative error code on failure.
3780 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3784 for (i = 0; i < adev->num_ip_blocks; i++) {
3785 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3787 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3788 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3789 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3790 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3791 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3793 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3802 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3804 * @adev: amdgpu_device pointer
3806 * Third resume function for hardware IPs. The list of all the hardware
3807 * IPs that make up the asic is walked and the resume callbacks are run for
3808 * all DCE. resume puts the hardware into a functional state after a suspend
3809 * and updates the software state as necessary. This function is also used
3810 * for restoring the GPU after a GPU reset.
3812 * Returns 0 on success, negative error code on failure.
3814 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3818 for (i = 0; i < adev->num_ip_blocks; i++) {
3819 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3821 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3822 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3832 * amdgpu_device_ip_resume - run resume for hardware IPs
3834 * @adev: amdgpu_device pointer
3836 * Main resume function for hardware IPs. The hardware IPs
3837 * are split into two resume functions because they are
3838 * also used in recovering from a GPU reset and some additional
3839 * steps need to be taken between them. In this case (S3/S4) they are run sequentially.
3841 * Returns 0 on success, negative error code on failure.
3843 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3847 r = amdgpu_device_ip_resume_phase1(adev);
3851 r = amdgpu_device_fw_loading(adev);
3855 r = amdgpu_device_ip_resume_phase2(adev);
3857 if (adev->mman.buffer_funcs_ring->sched.ready)
3858 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3863 amdgpu_fence_driver_hw_init(adev);
3865 r = amdgpu_device_ip_resume_phase3(adev);
3871 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3873 * @adev: amdgpu_device pointer
3875 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3877 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3879 if (amdgpu_sriov_vf(adev)) {
3880 if (adev->is_atom_fw) {
3881 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3882 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3884 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3885 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3888 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3889 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3894 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3896 * @asic_type: AMD asic type
3898 * Check if there is DC (new modesetting infrastructure) support for an asic.
3899 * returns true if DC has support, false if not.
3901 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3903 switch (asic_type) {
3904 #ifdef CONFIG_DRM_AMDGPU_SI
3908 /* chips with no display hardware */
3910 #if defined(CONFIG_DRM_AMD_DC)
3916 * We have systems in the wild with these ASICs that require
3917 * LVDS and VGA support which is not supported with DC.
3919 * Fall back to the non-DC driver here by default so as not to
3920 * cause regressions.
3922 #if defined(CONFIG_DRM_AMD_DC_SI)
3923 return amdgpu_dc > 0;
3932 * We have systems in the wild with these ASICs that require
3933 * VGA support which is not supported with DC.
3935 * Fall back to the non-DC driver here by default so as not to
3936 * cause regressions.
3938 return amdgpu_dc > 0;
3940 return amdgpu_dc != 0;
3944 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3951 * amdgpu_device_has_dc_support - check if dc is supported
3953 * @adev: amdgpu_device pointer
3955 * Returns true for supported, false for not supported
3957 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3959 if (adev->enable_virtual_display ||
3960 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3963 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3966 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3968 struct amdgpu_device *adev =
3969 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3970 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3972 /* It's a bug to not have a hive within this function */
3977 * Use task barrier to synchronize all xgmi reset works across the
3978 * hive. task_barrier_enter and task_barrier_exit will block
3979 * until all the threads running the xgmi reset works reach
3980 * those points. task_barrier_full will do both blocks.
3982 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3984 task_barrier_enter(&hive->tb);
3985 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3987 if (adev->asic_reset_res)
3990 task_barrier_exit(&hive->tb);
3991 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3993 if (adev->asic_reset_res)
3996 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3999 task_barrier_full(&hive->tb);
4000 adev->asic_reset_res = amdgpu_asic_reset(adev);
4004 if (adev->asic_reset_res)
4005 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4006 adev->asic_reset_res, adev_to_drm(adev)->unique);
4007 amdgpu_put_xgmi_hive(hive);
4010 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4012 char *input = amdgpu_lockup_timeout;
4013 char *timeout_setting = NULL;
4019 * By default, the timeout for non-compute jobs is 10000 ms
4020 * and 60000 ms for compute jobs.
4021 * In SR-IOV or passthrough mode, the timeout for compute
4022 * jobs is 60000 ms by default.
4024 adev->gfx_timeout = msecs_to_jiffies(10000);
4025 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4026 if (amdgpu_sriov_vf(adev))
4027 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
4028 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4030 adev->compute_timeout = msecs_to_jiffies(60000);
4032 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4033 while ((timeout_setting = strsep(&input, ",")) &&
4034 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4035 ret = kstrtol(timeout_setting, 0, &timeout);
4042 } else if (timeout < 0) {
4043 timeout = MAX_SCHEDULE_TIMEOUT;
4044 dev_warn(adev->dev, "lockup timeout disabled");
4045 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4047 timeout = msecs_to_jiffies(timeout);
4052 adev->gfx_timeout = timeout;
4055 adev->compute_timeout = timeout;
4058 adev->sdma_timeout = timeout;
4061 adev->video_timeout = timeout;
4068 * There is only one value specified and
4069 * it should apply to all non-compute jobs.
4072 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4073 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4074 adev->compute_timeout = adev->gfx_timeout;
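/*
 * Illustrative (assumed) module parameter forms handled above:
 *
 *	amdgpu.lockup_timeout=10000
 *		a single value, applied to all non-compute queues (and to
 *		compute as well under SR-IOV/passthrough)
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *		per-queue values in the order gfx, compute, sdma, video
 *
 * As the parsing shows, a negative value disables the timeout for that queue.
 */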
4082 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4084 * @adev: amdgpu_device pointer
4086 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in pass-through mode
4088 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4090 struct iommu_domain *domain;
4092 domain = iommu_get_domain_for_dev(adev->dev);
4093 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4094 adev->ram_is_direct_mapped = true;
4097 #if defined(CONFIG_HSA_AMD_P2P)
4099 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4101 * @adev: amdgpu_device pointer
4103 * Returns true if the IOMMU remaps the BAR address, false otherwise.
4105 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4107 struct iommu_domain *domain;
4109 domain = iommu_get_domain_for_dev(adev->dev);
4110 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4111 domain->type == IOMMU_DOMAIN_DMA_FQ))
4118 static const struct attribute *amdgpu_dev_attributes[] = {
4119 &dev_attr_pcie_replay_count.attr,
4123 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4125 if (amdgpu_mcbp == 1)
4126 adev->gfx.mcbp = true;
4127 else if (amdgpu_mcbp == 0)
4128 adev->gfx.mcbp = false;
4130 if (amdgpu_sriov_vf(adev))
4131 adev->gfx.mcbp = true;
4134 DRM_INFO("MCBP is enabled\n");
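/*
 * Illustrative kernel command line settings for the logic above (behaviour
 * inferred from the parsing here, not from module documentation):
 *
 *	amdgpu.mcbp=1	force mid-command-buffer preemption on
 *	amdgpu.mcbp=0	force it off
 *
 * Any other value keeps the per-asic default; SR-IOV VFs always enable it.
 */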
4138 * amdgpu_device_init - initialize the driver
4140 * @adev: amdgpu_device pointer
4141 * @flags: driver flags
4143 * Initializes the driver info and hw (all asics).
4144 * Returns 0 for success or an error on failure.
4145 * Called at driver startup.
4147 int amdgpu_device_init(struct amdgpu_device *adev,
4150 struct drm_device *ddev = adev_to_drm(adev);
4151 struct pci_dev *pdev = adev->pdev;
4157 adev->shutdown = false;
4158 adev->flags = flags;
4160 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4161 adev->asic_type = amdgpu_force_asic_type;
4163 adev->asic_type = flags & AMD_ASIC_MASK;
4165 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4166 if (amdgpu_emu_mode == 1)
4167 adev->usec_timeout *= 10;
4168 adev->gmc.gart_size = 512 * 1024 * 1024;
4169 adev->accel_working = false;
4170 adev->num_rings = 0;
4171 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4172 adev->mman.buffer_funcs = NULL;
4173 adev->mman.buffer_funcs_ring = NULL;
4174 adev->vm_manager.vm_pte_funcs = NULL;
4175 adev->vm_manager.vm_pte_num_scheds = 0;
4176 adev->gmc.gmc_funcs = NULL;
4177 adev->harvest_ip_mask = 0x0;
4178 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4179 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4181 adev->smc_rreg = &amdgpu_invalid_rreg;
4182 adev->smc_wreg = &amdgpu_invalid_wreg;
4183 adev->pcie_rreg = &amdgpu_invalid_rreg;
4184 adev->pcie_wreg = &amdgpu_invalid_wreg;
4185 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4186 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4187 adev->pciep_rreg = &amdgpu_invalid_rreg;
4188 adev->pciep_wreg = &amdgpu_invalid_wreg;
4189 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4190 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4191 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4192 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4193 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4194 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4195 adev->didt_rreg = &amdgpu_invalid_rreg;
4196 adev->didt_wreg = &amdgpu_invalid_wreg;
4197 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4198 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4199 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4200 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4202 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4203 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4204 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4206 /* mutex initializations are all done here so we
4207 * can call these functions without locking issues
4209 mutex_init(&adev->firmware.mutex);
4210 mutex_init(&adev->pm.mutex);
4211 mutex_init(&adev->gfx.gpu_clock_mutex);
4212 mutex_init(&adev->srbm_mutex);
4213 mutex_init(&adev->gfx.pipe_reserve_mutex);
4214 mutex_init(&adev->gfx.gfx_off_mutex);
4215 mutex_init(&adev->gfx.partition_mutex);
4216 mutex_init(&adev->grbm_idx_mutex);
4217 mutex_init(&adev->mn_lock);
4218 mutex_init(&adev->virt.vf_errors.lock);
4219 mutex_init(&adev->virt.rlcg_reg_lock);
4220 hash_init(adev->mn_hash);
4221 mutex_init(&adev->psp.mutex);
4222 mutex_init(&adev->notifier_lock);
4223 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4224 mutex_init(&adev->benchmark_mutex);
4225 mutex_init(&adev->gfx.reset_sem_mutex);
4226 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4227 mutex_init(&adev->enforce_isolation_mutex);
4228 mutex_init(&adev->gfx.kfd_sch_mutex);
4230 amdgpu_device_init_apu_flags(adev);
4232 r = amdgpu_device_check_arguments(adev);
4236 spin_lock_init(&adev->mmio_idx_lock);
4237 spin_lock_init(&adev->smc_idx_lock);
4238 spin_lock_init(&adev->pcie_idx_lock);
4239 spin_lock_init(&adev->uvd_ctx_idx_lock);
4240 spin_lock_init(&adev->didt_idx_lock);
4241 spin_lock_init(&adev->gc_cac_idx_lock);
4242 spin_lock_init(&adev->se_cac_idx_lock);
4243 spin_lock_init(&adev->audio_endpt_idx_lock);
4244 spin_lock_init(&adev->mm_stats.lock);
4245 spin_lock_init(&adev->wb.lock);
4247 INIT_LIST_HEAD(&adev->reset_list);
4249 INIT_LIST_HEAD(&adev->ras_list);
4251 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4253 INIT_DELAYED_WORK(&adev->delayed_init_work,
4254 amdgpu_device_delayed_init_work_handler);
4255 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4256 amdgpu_device_delay_enable_gfx_off);
4258 * Initialize the enforce_isolation work structures for each XCP
4259 * partition. This work handler is responsible for enforcing shader
4260 * isolation on AMD GPUs. It counts the number of emitted fences for
4261 * each GFX and compute ring. If there are any fences, it schedules
4262 * the `enforce_isolation_work` to be run after a delay. If there are
4263 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the runqueue.
4266 for (i = 0; i < MAX_XCP; i++) {
4267 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4268 amdgpu_gfx_enforce_isolation_handler);
4269 adev->gfx.enforce_isolation[i].adev = adev;
4270 adev->gfx.enforce_isolation[i].xcp_id = i;
4273 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4275 adev->gfx.gfx_off_req_count = 1;
4276 adev->gfx.gfx_off_residency = 0;
4277 adev->gfx.gfx_off_entrycount = 0;
4278 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4280 atomic_set(&adev->throttling_logging_enabled, 1);
4282 * If throttling continues, logging will be performed every minute
4283 * to avoid log flooding. "-1" is subtracted since the thermal
4284 * throttling interrupt comes every second. Thus, the total logging
4285 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4286 * for throttling interrupt) = 60 seconds.
4288 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4289 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
4291 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4292 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
4294 /* Registers mapping */
4295 /* TODO: block userspace mapping of io register */
4296 if (adev->asic_type >= CHIP_BONAIRE) {
4297 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4298 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4300 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4301 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4304 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4305 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4307 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4311 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4312 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4315 * The reset domain needs to be present early, before any XGMI hive is
4316 * discovered and initialized, so that the reset sem and in_gpu_reset flag
4317 * can be used early during init and before calling RREG32.
4319 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4320 if (!adev->reset_domain)
4323 /* detect hw virtualization here */
4324 amdgpu_detect_virtualization(adev);
4326 amdgpu_device_get_pcie_info(adev);
4328 r = amdgpu_device_get_job_timeout_settings(adev);
4330 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4334 amdgpu_device_set_mcbp(adev);
4337 * By default, use the default init level where all blocks are expected to be
4338 * initialized. At present a 'swinit' of blocks is required to be
4339 * completed before the need for a different level is detected.
4341 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4342 /* early init functions */
4343 r = amdgpu_device_ip_early_init(adev);
4347 /* Get rid of things like offb */
4348 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4352 /* Enable TMZ based on IP_VERSION */
4353 amdgpu_gmc_tmz_set(adev);
4355 if (amdgpu_sriov_vf(adev) &&
4356 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4357 /* VF MMIO access (except mailbox range) from CPU
4358 * will be blocked during sriov runtime
4360 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4362 amdgpu_gmc_noretry_set(adev);
4363 /* Need to get xgmi info early to decide the reset behavior */
4364 if (adev->gmc.xgmi.supported) {
4365 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4370 /* enable PCIE atomic ops */
4371 if (amdgpu_sriov_vf(adev)) {
4372 if (adev->virt.fw_reserve.p_pf2vf)
4373 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4374 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4375 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4376 /* APUs with gfx9 onwards don't rely on PCIe atomics; their internal
4377 * path natively supports atomics, so set have_atomics_support to true.
4379 } else if ((adev->flags & AMD_IS_APU) &&
4380 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4381 IP_VERSION(9, 0, 0))) {
4382 adev->have_atomics_support = true;
4384 adev->have_atomics_support =
4385 !pci_enable_atomic_ops_to_root(adev->pdev,
4386 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4387 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4390 if (!adev->have_atomics_support)
4391 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
4393 /* doorbell bar mapping and doorbell index init*/
4394 amdgpu_doorbell_init(adev);
4396 if (amdgpu_emu_mode == 1) {
4397 /* post the asic on emulation mode */
4398 emu_soc_asic_init(adev);
4399 goto fence_driver_init;
4402 amdgpu_reset_init(adev);
4404 /* detect if we are running with an SRIOV vbios */
4406 amdgpu_device_detect_sriov_bios(adev);
4408 /* check if we need to reset the asic
4409 * E.g., driver was not cleanly unloaded previously, etc.
4411 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4412 if (adev->gmc.xgmi.num_physical_nodes) {
4413 dev_info(adev->dev, "Pending hive reset.\n");
4414 amdgpu_set_init_level(adev,
4415 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4416 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4417 !amdgpu_device_has_display_hardware(adev)) {
4418 r = psp_gpu_reset(adev);
4420 tmp = amdgpu_reset_method;
4421 /* It should do a default reset when loading or reloading the driver,
4422 * regardless of the module parameter reset_method.
4424 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4425 r = amdgpu_asic_reset(adev);
4426 amdgpu_reset_method = tmp;
4430 dev_err(adev->dev, "asic reset on init failed\n");
4435 /* Post card if necessary */
4436 if (amdgpu_device_need_post(adev)) {
4438 dev_err(adev->dev, "no vBIOS found\n");
4442 DRM_INFO("GPU posting now...\n");
4443 r = amdgpu_device_asic_init(adev);
4445 dev_err(adev->dev, "gpu post error!\n");
4451 if (adev->is_atom_fw) {
4452 /* Initialize clocks */
4453 r = amdgpu_atomfirmware_get_clock_info(adev);
4455 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4456 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4460 /* Initialize clocks */
4461 r = amdgpu_atombios_get_clock_info(adev);
4463 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4464 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4467 /* init i2c buses */
4468 if (!amdgpu_device_has_dc_support(adev))
4469 amdgpu_atombios_i2c_init(adev);
4475 r = amdgpu_fence_driver_sw_init(adev);
4477 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4478 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4482 /* init the mode config */
4483 drm_mode_config_init(adev_to_drm(adev));
4485 r = amdgpu_device_ip_init(adev);
4487 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4488 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4489 goto release_ras_con;
4492 amdgpu_fence_driver_hw_init(adev);
4495 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4496 adev->gfx.config.max_shader_engines,
4497 adev->gfx.config.max_sh_per_se,
4498 adev->gfx.config.max_cu_per_sh,
4499 adev->gfx.cu_info.number);
4501 adev->accel_working = true;
4503 amdgpu_vm_check_compute_bug(adev);
4505 /* Initialize the buffer migration limit. */
4506 if (amdgpu_moverate >= 0)
4507 max_MBps = amdgpu_moverate;
4509 max_MBps = 8; /* Allow 8 MB/s. */
4510 /* Get a log2 for easy divisions. */
4511 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
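/* For example, with the default 8 MB/s this stores ilog2(8) = 3, so the
 * migration throttling math can replace a division by the rate with a shift.
 */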
4514 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4515 * Otherwise the mgpu fan boost feature will be skipped because the
4516 * gpu instance count would be too low.
4518 amdgpu_register_gpu_instance(adev);
4520 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4521 * explicit gating rather than handling it automatically.
4523 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4524 r = amdgpu_device_ip_late_init(adev);
4526 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4527 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4528 goto release_ras_con;
4531 amdgpu_ras_resume(adev);
4532 queue_delayed_work(system_wq, &adev->delayed_init_work,
4533 msecs_to_jiffies(AMDGPU_RESUME_MS));
4536 if (amdgpu_sriov_vf(adev)) {
4537 amdgpu_virt_release_full_gpu(adev, true);
4538 flush_delayed_work(&adev->delayed_init_work);
4542 * Place the sysfs registration after `late_init`, as some of the
4543 * operations performed in `late_init` might affect the creation of
4544 * the sysfs interfaces.
4546 r = amdgpu_atombios_sysfs_init(adev);
4548 drm_err(&adev->ddev,
4549 "registering atombios sysfs failed (%d).\n", r);
4551 r = amdgpu_pm_sysfs_init(adev);
4553 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4555 r = amdgpu_ucode_sysfs_init(adev);
4557 adev->ucode_sysfs_en = false;
4558 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4560 adev->ucode_sysfs_en = true;
4562 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4564 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4566 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4569 "Could not create amdgpu board attributes\n");
4571 amdgpu_fru_sysfs_init(adev);
4572 amdgpu_reg_state_sysfs_init(adev);
4573 amdgpu_xcp_cfg_sysfs_init(adev);
4575 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4576 r = amdgpu_pmu_init(adev);
4578 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4580 /* Keep the stored PCI config space at hand for restore in case of a sudden PCI error */
4581 if (amdgpu_device_cache_pci_state(adev->pdev))
4582 pci_restore_state(pdev);
4584 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4585 /* this will fail for cards that aren't VGA class devices, just
4588 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4589 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4591 px = amdgpu_device_supports_px(ddev);
4593 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4594 apple_gmux_detect(NULL, NULL)))
4595 vga_switcheroo_register_client(adev->pdev,
4596 &amdgpu_switcheroo_ops, px);
4599 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4601 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4602 amdgpu_xgmi_reset_on_init(adev);
4604 amdgpu_device_check_iommu_direct_map(adev);
4606 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4607 r = register_pm_notifier(&adev->pm_nb);
4614 if (amdgpu_sriov_vf(adev))
4615 amdgpu_virt_release_full_gpu(adev, true);
4617 /* failed in exclusive mode due to timeout */
4618 if (amdgpu_sriov_vf(adev) &&
4619 !amdgpu_sriov_runtime(adev) &&
4620 amdgpu_virt_mmio_blocked(adev) &&
4621 !amdgpu_virt_wait_reset(adev)) {
4622 dev_err(adev->dev, "VF exclusive mode timeout\n");
4623 /* Don't send request since VF is inactive. */
4624 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4625 adev->virt.ops = NULL;
4628 amdgpu_release_ras_context(adev);
4631 amdgpu_vf_error_trans_all(adev);
4636 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4639 /* Clear all CPU mappings pointing to this device */
4640 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4642 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4643 amdgpu_doorbell_fini(adev);
4645 iounmap(adev->rmmio);
4647 if (adev->mman.aper_base_kaddr)
4648 iounmap(adev->mman.aper_base_kaddr);
4649 adev->mman.aper_base_kaddr = NULL;
4651 /* Memory manager related */
4652 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4653 arch_phys_wc_del(adev->gmc.vram_mtrr);
4654 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4659 * amdgpu_device_fini_hw - tear down the driver
4661 * @adev: amdgpu_device pointer
4663 * Tear down the driver info (all asics).
4664 * Called at driver shutdown.
4666 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4668 dev_info(adev->dev, "amdgpu: finishing device.\n");
4669 flush_delayed_work(&adev->delayed_init_work);
4671 if (adev->mman.initialized)
4672 drain_workqueue(adev->mman.bdev.wq);
4673 adev->shutdown = true;
4675 unregister_pm_notifier(&adev->pm_nb);
4677 /* make sure the IB test has finished before entering exclusive mode
4678 * to avoid preemption during the IB test
4680 if (amdgpu_sriov_vf(adev)) {
4681 amdgpu_virt_request_full_gpu(adev, false);
4682 amdgpu_virt_fini_data_exchange(adev);
4685 /* disable all interrupts */
4686 amdgpu_irq_disable_all(adev);
4687 if (adev->mode_info.mode_config_initialized) {
4688 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4689 drm_helper_force_disable_all(adev_to_drm(adev));
4691 drm_atomic_helper_shutdown(adev_to_drm(adev));
4693 amdgpu_fence_driver_hw_fini(adev);
4695 if (adev->pm.sysfs_initialized)
4696 amdgpu_pm_sysfs_fini(adev);
4697 if (adev->ucode_sysfs_en)
4698 amdgpu_ucode_sysfs_fini(adev);
4699 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4700 amdgpu_fru_sysfs_fini(adev);
4702 amdgpu_reg_state_sysfs_fini(adev);
4703 amdgpu_xcp_cfg_sysfs_fini(adev);
4705 /* RAS features must be disabled before hw fini */
4706 amdgpu_ras_pre_fini(adev);
4708 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4710 amdgpu_device_ip_fini_early(adev);
4712 amdgpu_irq_fini_hw(adev);
4714 if (adev->mman.initialized)
4715 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4717 amdgpu_gart_dummy_page_fini(adev);
4719 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4720 amdgpu_device_unmap_mmio(adev);
4724 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4729 amdgpu_device_ip_fini(adev);
4730 amdgpu_fence_driver_sw_fini(adev);
4731 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4732 adev->accel_working = false;
4733 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4735 amdgpu_reset_fini(adev);
4737 /* free i2c buses */
4738 if (!amdgpu_device_has_dc_support(adev))
4739 amdgpu_i2c_fini(adev);
4741 if (amdgpu_emu_mode != 1)
4742 amdgpu_atombios_fini(adev);
4747 kfree(adev->fru_info);
4748 adev->fru_info = NULL;
4750 px = amdgpu_device_supports_px(adev_to_drm(adev));
4752 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4753 apple_gmux_detect(NULL, NULL)))
4754 vga_switcheroo_unregister_client(adev->pdev);
4757 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4759 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4760 vga_client_unregister(adev->pdev);
4762 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4764 iounmap(adev->rmmio);
4766 amdgpu_doorbell_fini(adev);
4770 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4771 amdgpu_pmu_fini(adev);
4772 if (adev->mman.discovery_bin)
4773 amdgpu_discovery_fini(adev);
4775 amdgpu_reset_put_reset_domain(adev->reset_domain);
4776 adev->reset_domain = NULL;
4778 kfree(adev->pci_state);
4783 * amdgpu_device_evict_resources - evict device resources
4784 * @adev: amdgpu device object
4786 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4787 * of the vram memory type. Mainly used for evicting device resources
4791 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4795 /* No need to evict vram on APUs unless going to S4 */
4796 if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
4799 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4801 DRM_WARN("evicting device resources failed\n");
4809 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4810 * @nb: notifier block
4811 * @mode: suspend mode
4814 * This function is called when the system is about to suspend or hibernate.
4815 * It is used to evict resources from the device before the system goes to
4816 * sleep while there is still access to swap.
4818 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4821 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4825 case PM_HIBERNATION_PREPARE:
4828 case PM_SUSPEND_PREPARE:
4829 r = amdgpu_device_evict_resources(adev);
4831 * This is considered non-fatal at this time because
4832 * amdgpu_device_prepare() will also fatally evict resources.
4833 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
4836 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
4844 * amdgpu_device_prepare - prepare for device suspend
4846 * @dev: drm dev pointer
4848 * Prepare to put the hw in the suspend state (all asics).
4849 * Returns 0 for success or an error on failure.
4850 * Called at driver suspend.
4852 int amdgpu_device_prepare(struct drm_device *dev)
4854 struct amdgpu_device *adev = drm_to_adev(dev);
4857 amdgpu_choose_low_power_state(adev);
4859 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4862 /* Evict the majority of BOs before starting suspend sequence */
4863 r = amdgpu_device_evict_resources(adev);
4867 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
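/* Give every IP block that implements prepare_suspend a chance to do work
 * that may still fail before the actual suspend sequence starts.
 */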
4869 for (i = 0; i < adev->num_ip_blocks; i++) {
4870 if (!adev->ip_blocks[i].status.valid)
4872 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4874 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
4882 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false;
4888 * amdgpu_device_suspend - initiate device suspend
4890 * @dev: drm dev pointer
4891 * @notify_clients: notify in-kernel DRM clients
4893 * Puts the hw in the suspend state (all asics).
4894 * Returns 0 for success or an error on failure.
4895 * Called at driver suspend.
4897 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
4899 struct amdgpu_device *adev = drm_to_adev(dev);
4902 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4905 adev->in_suspend = true;
4907 if (amdgpu_sriov_vf(adev)) {
4908 amdgpu_virt_fini_data_exchange(adev);
4909 r = amdgpu_virt_request_full_gpu(adev, false);
4914 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4915 DRM_WARN("smart shift update failed\n");
4918 drm_client_dev_suspend(adev_to_drm(adev), false);
4920 cancel_delayed_work_sync(&adev->delayed_init_work);
4922 amdgpu_ras_suspend(adev);
4924 amdgpu_device_ip_suspend_phase1(adev);
4927 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4929 r = amdgpu_device_evict_resources(adev);
4933 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4935 amdgpu_fence_driver_hw_fini(adev);
4937 amdgpu_device_ip_suspend_phase2(adev);
4939 if (amdgpu_sriov_vf(adev))
4940 amdgpu_virt_release_full_gpu(adev, false);
4942 r = amdgpu_dpm_notify_rlc_state(adev, false);
4950 * amdgpu_device_resume - initiate device resume
4952 * @dev: drm dev pointer
4953 * @notify_clients: notify in-kernel DRM clients
4955 * Bring the hw back to operating state (all asics).
4956 * Returns 0 for success or an error on failure.
4957 * Called at driver resume.
4959 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
4961 struct amdgpu_device *adev = drm_to_adev(dev);
4964 if (amdgpu_sriov_vf(adev)) {
4965 r = amdgpu_virt_request_full_gpu(adev, true);
4970 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4974 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4977 if (amdgpu_device_need_post(adev)) {
4978 r = amdgpu_device_asic_init(adev);
4980 dev_err(adev->dev, "amdgpu asic init failed\n");
4983 r = amdgpu_device_ip_resume(adev);
4986 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4990 if (!adev->in_s0ix) {
4991 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4996 r = amdgpu_device_ip_late_init(adev);
5000 queue_delayed_work(system_wq, &adev->delayed_init_work,
5001 msecs_to_jiffies(AMDGPU_RESUME_MS));
5003 if (amdgpu_sriov_vf(adev)) {
5004 amdgpu_virt_init_data_exchange(adev);
5005 amdgpu_virt_release_full_gpu(adev, true);
5011 /* Make sure IB tests flushed */
5012 flush_delayed_work(&adev->delayed_init_work);
5015 drm_client_dev_resume(adev_to_drm(adev), false);
5017 amdgpu_ras_resume(adev);
5019 if (adev->mode_info.num_crtc) {
5021 * Most of the connector probing functions try to acquire runtime pm
5022 * refs to ensure that the GPU is powered on when connector polling is
5023 * performed. Since we're calling this from a runtime PM callback,
5024 * trying to acquire rpm refs will cause us to deadlock.
5026 * Since we're guaranteed to be holding the rpm lock, it's safe to
5027 * temporarily disable the rpm helpers so this doesn't deadlock us.
5030 dev->dev->power.disable_depth++;
5032 if (!adev->dc_enabled)
5033 drm_helper_hpd_irq_event(dev);
5035 drm_kms_helper_hotplug_event(dev);
5037 dev->dev->power.disable_depth--;
5040 adev->in_suspend = false;
5042 if (adev->enable_mes)
5043 amdgpu_mes_self_test(adev);
5045 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5046 DRM_WARN("smart shift update failed\n");
5052 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5054 * @adev: amdgpu_device pointer
5056 * The list of all the hardware IPs that make up the asic is walked and
5057 * the check_soft_reset callbacks are run. check_soft_reset determines
5058 * if the asic is still hung or not.
5059 * Returns true if any of the IPs are still in a hung state, false if not.
5061 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5064 bool asic_hang = false;
5066 if (amdgpu_sriov_vf(adev))
5069 if (amdgpu_asic_need_full_reset(adev))
5072 for (i = 0; i < adev->num_ip_blocks; i++) {
5073 if (!adev->ip_blocks[i].status.valid)
5075 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5076 adev->ip_blocks[i].status.hang =
5077 adev->ip_blocks[i].version->funcs->check_soft_reset(
5078 &adev->ip_blocks[i]);
5079 if (adev->ip_blocks[i].status.hang) {
5080 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5088 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5090 * @adev: amdgpu_device pointer
5092 * The list of all the hardware IPs that make up the asic is walked and the
5093 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5094 * handles any IP specific hardware or software state changes that are
5095 * necessary for a soft reset to succeed.
5096 * Returns 0 on success, negative error code on failure.
5098 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5102 for (i = 0; i < adev->num_ip_blocks; i++) {
5103 if (!adev->ip_blocks[i].status.valid)
5105 if (adev->ip_blocks[i].status.hang &&
5106 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5107 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5117 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5119 * @adev: amdgpu_device pointer
5121 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5122 * reset is necessary to recover.
5123 * Returns true if a full asic reset is required, false if not.
5125 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5129 if (amdgpu_asic_need_full_reset(adev))
5132 for (i = 0; i < adev->num_ip_blocks; i++) {
5133 if (!adev->ip_blocks[i].status.valid)
5135 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5136 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5137 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5138 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5139 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5140 if (adev->ip_blocks[i].status.hang) {
5141 dev_info(adev->dev, "Some blocks need full reset!\n");
5150 * amdgpu_device_ip_soft_reset - do a soft reset
5152 * @adev: amdgpu_device pointer
5154 * The list of all the hardware IPs that make up the asic is walked and the
5155 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5156 * IP specific hardware or software state changes that are necessary to soft
5158 * Returns 0 on success, negative error code on failure.
5160 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5164 for (i = 0; i < adev->num_ip_blocks; i++) {
5165 if (!adev->ip_blocks[i].status.valid)
5167 if (adev->ip_blocks[i].status.hang &&
5168 adev->ip_blocks[i].version->funcs->soft_reset) {
5169 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5179 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5181 * @adev: amdgpu_device pointer
5183 * The list of all the hardware IPs that make up the asic is walked and the
5184 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5185 * handles any IP specific hardware or software state changes that are
5186 * necessary after the IP has been soft reset.
5187 * Returns 0 on success, negative error code on failure.
5189 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5193 for (i = 0; i < adev->num_ip_blocks; i++) {
5194 if (!adev->ip_blocks[i].status.valid)
5196 if (adev->ip_blocks[i].status.hang &&
5197 adev->ip_blocks[i].version->funcs->post_soft_reset)
5198 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5207 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5209 * @adev: amdgpu_device pointer
5210 * @reset_context: amdgpu reset context pointer
5212 * Do a VF FLR and reinitialize the ASIC.
5213 * Returns 0 on success, otherwise an error.
5215 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5216 struct amdgpu_reset_context *reset_context)
5219 struct amdgpu_hive_info *hive = NULL;
5221 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5222 if (!amdgpu_ras_get_fed_status(adev))
5223 amdgpu_virt_ready_to_reset(adev);
5224 amdgpu_virt_wait_reset(adev);
5225 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5226 r = amdgpu_virt_request_full_gpu(adev, true);
5228 r = amdgpu_virt_reset_gpu(adev);
5233 amdgpu_ras_clear_err_state(adev);
5234 amdgpu_irq_gpu_reset_resume_helper(adev);
5236 /* some SW cleanup the VF needs to do before recovery */
5237 amdgpu_virt_post_reset(adev);
5239 /* Resume IP prior to SMC */
5240 r = amdgpu_device_ip_reinit_early_sriov(adev);
5244 amdgpu_virt_init_data_exchange(adev);
5246 r = amdgpu_device_fw_loading(adev);
5250 /* now we are okay to resume SMC/CP/SDMA */
5251 r = amdgpu_device_ip_reinit_late_sriov(adev);
5255 hive = amdgpu_get_xgmi_hive(adev);
5256 /* Update PSP FW topology after reset */
5257 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5258 r = amdgpu_xgmi_update_topology(hive, adev);
5260 amdgpu_put_xgmi_hive(hive);
5264 r = amdgpu_ib_ring_tests(adev);
5268 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5269 amdgpu_inc_vram_lost(adev);
5271 /* need to be called during full access so we can't do it later like
5274 amdgpu_amdkfd_post_reset(adev);
5275 amdgpu_virt_release_full_gpu(adev, true);
5277 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */
5278 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5279 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5280 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5281 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5282 amdgpu_ras_resume(adev);
5284 amdgpu_virt_ras_telemetry_post_reset(adev);
5290 * amdgpu_device_has_job_running - check if there is any unfinished job
5292 * @adev: amdgpu_device pointer
5294 * Check if there is any job running on the device when the guest driver receives
5295 * an FLR notification from the host driver. If there are still jobs running, the
5296 * guest driver will not respond to the FLR reset. Instead, let the job hit the
5297 * timeout, and the guest driver will then issue the reset request.
5299 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5303 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5304 struct amdgpu_ring *ring = adev->rings[i];
5306 if (!amdgpu_ring_sched_ready(ring))
5309 if (amdgpu_fence_count_emitted(ring))
5316 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5318 * @adev: amdgpu_device pointer
5320 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5323 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5326 if (amdgpu_gpu_recovery == 0)
5329 /* Skip soft reset check in fatal error mode */
5330 if (!amdgpu_ras_is_poison_mode_supported(adev))
5333 if (amdgpu_sriov_vf(adev))
5336 if (amdgpu_gpu_recovery == -1) {
5337 switch (adev->asic_type) {
5338 #ifdef CONFIG_DRM_AMDGPU_SI
5345 #ifdef CONFIG_DRM_AMDGPU_CIK
5352 case CHIP_CYAN_SKILLFISH:
5362 dev_info(adev->dev, "GPU recovery disabled.\n");
5366 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5371 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5373 dev_info(adev->dev, "GPU mode1 reset\n");
5375 /* Cache the state before bus master disable. The saved config space
5376 * values are used in other cases like restore after mode-2 reset.
5378 amdgpu_device_cache_pci_state(adev->pdev);
5381 pci_clear_master(adev->pdev);
5383 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5384 dev_info(adev->dev, "GPU smu mode1 reset\n");
5385 ret = amdgpu_dpm_mode1_reset(adev);
5387 dev_info(adev->dev, "GPU psp mode1 reset\n");
5388 ret = psp_gpu_reset(adev);
5392 goto mode1_reset_failed;
5394 amdgpu_device_load_pci_state(adev->pdev);
5395 ret = amdgpu_psp_wait_for_bootloader(adev);
5397 goto mode1_reset_failed;
5399 /* wait for asic to come out of reset */
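/* MMIO reads return all 1s (0xffffffff) while the ASIC is still held in
 * reset, so poll the memsize register until it reads back a sane value.
 */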
5400 for (i = 0; i < adev->usec_timeout; i++) {
5401 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5403 if (memsize != 0xffffffff)
5408 if (i >= adev->usec_timeout) {
5410 goto mode1_reset_failed;
5413 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5418 dev_err(adev->dev, "GPU mode1 reset failed\n");
5422 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5423 struct amdgpu_reset_context *reset_context)
5426 struct amdgpu_job *job = NULL;
5427 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5428 bool need_full_reset =
5429 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5431 if (reset_context->reset_req_dev == adev)
5432 job = reset_context->job;
5434 if (amdgpu_sriov_vf(adev))
5435 amdgpu_virt_pre_reset(adev);
5437 amdgpu_fence_driver_isr_toggle(adev, true);
5439 /* block all schedulers and reset given job's ring */
5440 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5441 struct amdgpu_ring *ring = adev->rings[i];
5443 if (!amdgpu_ring_sched_ready(ring))
5446 /* Clear the job fences from the fence drv to avoid force_completion;
5447 * leave NULL and the vm flush fences in the fence drv
5449 amdgpu_fence_driver_clear_job_fences(ring);
5451 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5452 amdgpu_fence_driver_force_completion(ring);
5455 amdgpu_fence_driver_isr_toggle(adev, false);
5458 drm_sched_increase_karma(&job->base);
5460 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5461 /* If reset handler not implemented, continue; otherwise return */
5462 if (r == -EOPNOTSUPP)
5467 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5468 if (!amdgpu_sriov_vf(adev)) {
5470 if (!need_full_reset)
5471 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5473 if (!need_full_reset && amdgpu_gpu_recovery &&
5474 amdgpu_device_ip_check_soft_reset(adev)) {
5475 amdgpu_device_ip_pre_soft_reset(adev);
5476 r = amdgpu_device_ip_soft_reset(adev);
5477 amdgpu_device_ip_post_soft_reset(adev);
5478 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5479 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5480 need_full_reset = true;
5484 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5485 dev_info(tmp_adev->dev, "Dumping IP State\n");
5486 /* Trigger ip dump before we reset the asic */
5487 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5488 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5489 tmp_adev->ip_blocks[i].version->funcs
5490 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5491 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5494 if (need_full_reset)
5495 r = amdgpu_device_ip_suspend(adev);
5496 if (need_full_reset)
5497 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5499 clear_bit(AMDGPU_NEED_FULL_RESET,
5500 &reset_context->flags);
5506 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5508 struct list_head *device_list_handle;
5509 bool full_reset, vram_lost = false;
5510 struct amdgpu_device *tmp_adev;
5513 device_list_handle = reset_context->reset_device_list;
5515 if (!device_list_handle)
5518 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5521 * If this is a reset on init, use the default init level; otherwise
5522 * keep the level as the recovery level.
5524 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5525 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5527 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5530 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5531 amdgpu_set_init_level(tmp_adev, init_level);
5534 amdgpu_ras_clear_err_state(tmp_adev);
5535 r = amdgpu_device_asic_init(tmp_adev);
5537 dev_warn(tmp_adev->dev, "asic atom init failed!");
5539 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5541 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5545 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5547 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5548 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5551 DRM_INFO("VRAM is lost due to GPU reset!\n");
5552 amdgpu_inc_vram_lost(tmp_adev);
5555 r = amdgpu_device_fw_loading(tmp_adev);
5559 r = amdgpu_xcp_restore_partition_mode(
5564 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5568 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5569 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5571 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5576 amdgpu_device_fill_reset_magic(tmp_adev);
5579 * Add this ASIC back as tracked since the reset has already
5580 * completed successfully.
5582 amdgpu_register_gpu_instance(tmp_adev);
5584 if (!reset_context->hive &&
5585 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5586 amdgpu_xgmi_add_device(tmp_adev);
5588 r = amdgpu_device_ip_late_init(tmp_adev);
5592 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5595 * The GPU enters a bad state once the number of faulty pages
5596 * from ECC errors reaches the threshold, and RAS
5597 * recovery is scheduled next. So add a check
5598 * here to break recovery if the bad page threshold
5599 * is indeed exceeded, and remind the user to
5600 * retire this GPU or set a bigger
5601 * bad_page_threshold value to work around this when
5602 * probing the driver again.
5604 if (!amdgpu_ras_is_rma(tmp_adev)) {
5606 amdgpu_ras_resume(tmp_adev);
5612 /* Update PSP FW topology after reset */
5613 if (reset_context->hive &&
5614 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5615 r = amdgpu_xgmi_update_topology(
5616 reset_context->hive, tmp_adev);
5622 /* IP init is complete now, set level as default */
5623 amdgpu_set_init_level(tmp_adev,
5624 AMDGPU_INIT_LEVEL_DEFAULT);
5625 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5626 r = amdgpu_ib_ring_tests(tmp_adev);
5628 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5635 tmp_adev->asic_reset_res = r;
5642 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5643 struct amdgpu_reset_context *reset_context)
5645 struct amdgpu_device *tmp_adev = NULL;
5646 bool need_full_reset, skip_hw_reset;
5649 /* Try reset handler method first */
5650 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5653 reset_context->reset_device_list = device_list_handle;
5654 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5655 /* If reset handler not implemented, continue; otherwise return */
5656 if (r == -EOPNOTSUPP)
5661 /* Reset handler not implemented, use the default method */
5663 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5664 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5667 * ASIC reset has to be done on all XGMI hive nodes ASAP
5668 * to allow proper link negotiation in FW (within 1 sec)
5670 if (!skip_hw_reset && need_full_reset) {
5671 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5672 /* For XGMI run all resets in parallel to speed up the process */
5673 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5674 if (!queue_work(system_unbound_wq,
5675 &tmp_adev->xgmi_reset_work))
5678 r = amdgpu_asic_reset(tmp_adev);
5681 dev_err(tmp_adev->dev,
5682 "ASIC reset failed with error, %d for drm dev, %s",
5683 r, adev_to_drm(tmp_adev)->unique);
5688 /* For XGMI wait for all resets to complete before proceeding */
5690 list_for_each_entry(tmp_adev, device_list_handle,
5692 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5693 flush_work(&tmp_adev->xgmi_reset_work);
5694 r = tmp_adev->asic_reset_res;
5702 if (!r && amdgpu_ras_intr_triggered()) {
5703 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5704 amdgpu_ras_reset_error_count(tmp_adev,
5705 AMDGPU_RAS_BLOCK__MMHUB);
5708 amdgpu_ras_intr_cleared();
5711 r = amdgpu_device_reinit_after_reset(reset_context);
5713 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5715 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5721 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5724 switch (amdgpu_asic_reset_method(adev)) {
5725 case AMD_RESET_METHOD_MODE1:
5726 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5728 case AMD_RESET_METHOD_MODE2:
5729 adev->mp1_state = PP_MP1_STATE_RESET;
5732 adev->mp1_state = PP_MP1_STATE_NONE;
5737 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5739 amdgpu_vf_error_trans_all(adev);
5740 adev->mp1_state = PP_MP1_STATE_NONE;
5743 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5745 struct pci_dev *p = NULL;
5747 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5748 adev->pdev->bus->number, 1);
5750 pm_runtime_enable(&(p->dev));
5751 pm_runtime_resume(&(p->dev));
5757 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5759 enum amd_reset_method reset_method;
5760 struct pci_dev *p = NULL;
5764 * For now, only BACO and mode1 reset are confirmed
5765 * to suffer the audio issue if not properly suspended.
5767 reset_method = amdgpu_asic_reset_method(adev);
5768 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5769 (reset_method != AMD_RESET_METHOD_MODE1))
5772 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5773 adev->pdev->bus->number, 1);
5777 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5780 * If we cannot get the audio device autosuspend delay,
5781 * a fixed 4S interval will be used. Since 3S is the
5782 * audio controller's default autosuspend delay setting,
5783 * the 4S used here is guaranteed to cover it.
5785 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5787 while (!pm_runtime_status_suspended(&(p->dev))) {
5788 if (!pm_runtime_suspend(&(p->dev)))
5791 if (expires < ktime_get_mono_fast_ns()) {
5792 dev_warn(adev->dev, "failed to suspend display audio\n");
5794 /* TODO: abort the succeeding gpu reset? */
5799 pm_runtime_disable(&(p->dev));
5805 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5807 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5809 #if defined(CONFIG_DEBUG_FS)
5810 if (!amdgpu_sriov_vf(adev))
5811 cancel_work(&adev->reset_work);
5815 cancel_work(&adev->kfd.reset_work);
5817 if (amdgpu_sriov_vf(adev))
5818 cancel_work(&adev->virt.flr_work);
5820 if (con && adev->ras_enabled)
5821 cancel_work(&con->recovery_work);
5825 static int amdgpu_device_health_check(struct list_head *device_list_handle)
5827 struct amdgpu_device *tmp_adev;
5831 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
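/* A config space read of all 1s means the device has dropped off the bus;
 * PCI_POSSIBLE_ERROR() checks for exactly that pattern.
 */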
5832 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5833 if (PCI_POSSIBLE_ERROR(status)) {
5834 dev_err(tmp_adev->dev, "device lost from bus!");
5843 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5845 * @adev: amdgpu_device pointer
5846 * @job: which job trigger hang
5847 * @reset_context: amdgpu reset context pointer
5849 * Attempt to reset the GPU if it has hung (all asics).
5850 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5851 * Returns 0 for success or an error on failure.
5854 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5855 struct amdgpu_job *job,
5856 struct amdgpu_reset_context *reset_context)
5858 struct list_head device_list, *device_list_handle = NULL;
5859 bool job_signaled = false;
5860 struct amdgpu_hive_info *hive = NULL;
5861 struct amdgpu_device *tmp_adev = NULL;
5863 bool need_emergency_restart = false;
5864 bool audio_suspended = false;
5865 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5868 * If it reaches here because of hang/timeout and a RAS error is
5869 * detected at the same time, let RAS recovery take care of it.
5871 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
5872 !amdgpu_sriov_vf(adev) &&
5873 reset_context->src != AMDGPU_RESET_SRC_RAS) {
5875 "Gpu recovery from source: %d yielding to RAS error recovery handling",
5876 reset_context->src);
5880 * Special case: RAS triggered and full reset isn't supported
5882 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5885 * Flush RAM to disk so that after reboot
5886 * the user can read log and see why the system rebooted.
5888 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5889 amdgpu_ras_get_context(adev)->reboot) {
5890 DRM_WARN("Emergency reboot.");
5893 emergency_restart();
5896 dev_info(adev->dev, "GPU %s begin!\n",
5897 need_emergency_restart ? "jobs stop":"reset");
5899 if (!amdgpu_sriov_vf(adev))
5900 hive = amdgpu_get_xgmi_hive(adev);
5902 mutex_lock(&hive->hive_lock);
5904 reset_context->job = job;
5905 reset_context->hive = hive;
5907 * Build list of devices to reset.
5908 * In case we are in XGMI hive mode, resort the device list
5909 * to put adev in the 1st position.
5911 INIT_LIST_HEAD(&device_list);
5912 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5913 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5914 list_add_tail(&tmp_adev->reset_list, &device_list);
5916 tmp_adev->shutdown = true;
5918 if (!list_is_first(&adev->reset_list, &device_list))
5919 list_rotate_to_front(&adev->reset_list, &device_list);
5920 device_list_handle = &device_list;
5922 list_add_tail(&adev->reset_list, &device_list);
5923 device_list_handle = &device_list;
5926 if (!amdgpu_sriov_vf(adev)) {
5927 r = amdgpu_device_health_check(device_list_handle);
5932 /* We need to lock reset domain only once both for XGMI and single device */
5933 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5935 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5937 /* block all schedulers and reset given job's ring */
5938 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5940 amdgpu_device_set_mp1_state(tmp_adev);
5943 * Try to put the audio codec into suspend state
5944 * before the gpu reset starts.
5946 * The power domain of the graphics device is
5947 * shared with the AZ power domain. Without this,
5948 * we may change the audio hardware behind
5949 * the audio driver's back, which will trigger
5950 * some audio codec errors.
5952 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5953 audio_suspended = true;
5955 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5957 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5959 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5962 * Mark these ASICs to be reset as untracked first,
5963 * and add them back after the reset completes.
5965 amdgpu_unregister_gpu_instance(tmp_adev);
5967 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
5969 /* disable ras on ALL IPs */
5970 if (!need_emergency_restart &&
5971 amdgpu_device_ip_need_full_reset(tmp_adev))
5972 amdgpu_ras_suspend(tmp_adev);
5974 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5975 struct amdgpu_ring *ring = tmp_adev->rings[i];
5977 if (!amdgpu_ring_sched_ready(ring))
5980 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5982 if (need_emergency_restart)
5983 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5985 atomic_inc(&tmp_adev->gpu_reset_counter);
5988 if (need_emergency_restart)
5989 goto skip_sched_resume;
5992 * Must check guilty signal here since after this point all old
5993 * HW fences are force signaled.
5995 * job->base holds a reference to parent fence
5997 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5998 job_signaled = true;
5999 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6003 retry: /* Rest of adevs pre asic reset from XGMI hive. */
6004 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6005 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6006 /*TODO Should we stop ?*/
6008 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6009 r, adev_to_drm(tmp_adev)->unique);
6010 tmp_adev->asic_reset_res = r;
6014 /* Actual ASIC resets if needed. */
6015 /* Host driver will handle XGMI hive reset for SRIOV */
6016 if (amdgpu_sriov_vf(adev)) {
6017 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6018 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6019 amdgpu_ras_set_fed(adev, true);
6020 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6023 r = amdgpu_device_reset_sriov(adev, reset_context);
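/* Transient SR-IOV reset failures are retried, releasing full GPU access
 * between attempts, until retry_limit (initialized from
 * AMDGPU_MAX_RETRY_LIMIT above) is exhausted.
 */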
6024 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6025 amdgpu_virt_release_full_gpu(adev, true);
6029 adev->asic_reset_res = r;
6031 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
6032 if (r && r == -EAGAIN)
6036 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6038 * Drop any pending non scheduler resets queued before reset is done.
6039 * Any reset scheduled after this point would be valid. Scheduler resets
6040 * were already dropped during drm_sched_stop and no new ones can come
6041 * in before drm_sched_start.
6043 amdgpu_device_stop_pending_resets(tmp_adev);
6048 /* Post ASIC reset for all devs. */
6049 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6051 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6052 struct amdgpu_ring *ring = tmp_adev->rings[i];
6054 if (!amdgpu_ring_sched_ready(ring))
6057 drm_sched_start(&ring->sched, 0);
6060 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6061 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6063 if (tmp_adev->asic_reset_res)
6064 r = tmp_adev->asic_reset_res;
6066 tmp_adev->asic_reset_res = 0;
6069 /* bad news, how to tell it to userspace?
6070 * for ras error, we should report GPU bad status instead of
6073 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6074 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6075 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6076 atomic_read(&tmp_adev->gpu_reset_counter));
6077 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6079 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6080 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6081 DRM_WARN("smart shift update failed\n");
6086 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6087 /* unlock kfd: SRIOV would do it separately */
6088 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6089 amdgpu_amdkfd_post_reset(tmp_adev);
6091 /* kfd_post_reset will do nothing if kfd device is not initialized,
6092 * need to bring up kfd here if it was not initialized before
6094 if (!adev->kfd.init_complete)
6095 amdgpu_amdkfd_device_init(adev);
6097 if (audio_suspended)
6098 amdgpu_device_resume_display_audio(tmp_adev);
6100 amdgpu_device_unset_mp1_state(tmp_adev);
6102 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6105 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6107 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6111 mutex_unlock(&hive->hive_lock);
6112 amdgpu_put_xgmi_hive(hive);
6116 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6118 atomic_set(&adev->reset_domain->reset_res, r);
6123 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6125 * @adev: amdgpu_device pointer
6126 * @speed: pointer to the speed of the link
6127 * @width: pointer to the width of the link
6129 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6130 * first physical partner to an AMD dGPU.
6131 * This will exclude any virtual switches and links.
6133 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6134 enum pci_bus_speed *speed,
6135 enum pcie_link_width *width)
6137 struct pci_dev *parent = adev->pdev;
6139 if (!speed || !width)
6142 *speed = PCI_SPEED_UNKNOWN;
6143 *width = PCIE_LNK_WIDTH_UNKNOWN;
6145 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6146 while ((parent = pci_upstream_bridge(parent))) {
6147 /* skip upstream/downstream switches internal to dGPU */
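/* The first non-AMD bridge above the GPU is its physical link partner
 * (typically a root port or an external switch); report its capabilities.
 */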
6148 if (parent->vendor == PCI_VENDOR_ID_ATI)
6150 *speed = pcie_get_speed_cap(parent);
6151 *width = pcie_get_width_cap(parent);
6155 /* use the current speeds rather than max if switching is not supported */
6156 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6161 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
6163 * @adev: amdgpu_device pointer
6165 * Fetches and stores in the driver the PCIE capabilities (gen speed
6166 * and lanes) of the slot the device is in. Handles APUs and
6167 * virtualized environments where PCIE config space may not be available.
6169 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6171 struct pci_dev *pdev;
6172 enum pci_bus_speed speed_cap, platform_speed_cap;
6173 enum pcie_link_width platform_link_width;
6175 if (amdgpu_pcie_gen_cap)
6176 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6178 if (amdgpu_pcie_lane_cap)
6179 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6181 /* covers APUs as well */
6182 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6183 if (adev->pm.pcie_gen_mask == 0)
6184 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6185 if (adev->pm.pcie_mlw_mask == 0)
6186 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6190 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6193 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6194 &platform_link_width);
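/* The device and platform capabilities are filled in independently below;
 * each mask lists every link speed (or width) up to and including the
 * detected cap.
 */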
6196 if (adev->pm.pcie_gen_mask == 0) {
6199 speed_cap = pcie_get_speed_cap(pdev);
6200 if (speed_cap == PCI_SPEED_UNKNOWN) {
6201 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6202 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6203 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6205 if (speed_cap == PCIE_SPEED_32_0GT)
6206 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6207 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6208 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6209 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6210 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6211 else if (speed_cap == PCIE_SPEED_16_0GT)
6212 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6213 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6214 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6215 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6216 else if (speed_cap == PCIE_SPEED_8_0GT)
6217 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6218 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6219 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6220 else if (speed_cap == PCIE_SPEED_5_0GT)
6221 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6222 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6224 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6227 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6228 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6229 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6231 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6232 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6233 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6234 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6235 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6236 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6237 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6238 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6239 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6240 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6241 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6242 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6243 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6244 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6245 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6246 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6247 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6248 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6250 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6254 if (adev->pm.pcie_mlw_mask == 0) {
6255 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6256 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6258 switch (platform_link_width) {
6260 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6261 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6262 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6263 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6264 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6265 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6266 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6269 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6270 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6271 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6272 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6273 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6274 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6277 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6278 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6279 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6280 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6281 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6284 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6285 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6286 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6287 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6290 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6291 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6292 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6295 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6296 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6299 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6309 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6311 * @adev: amdgpu_device pointer
6312 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6314 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6315 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6318 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6319 struct amdgpu_device *peer_adev)
6321 #ifdef CONFIG_HSA_AMD_P2P
6323 !adev->gmc.xgmi.connected_to_cpu &&
6324 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6326 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6327 pci_name(peer_adev->pdev));
6329 bool is_large_bar = adev->gmc.visible_vram_size &&
6330 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6331 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6333 if (!p2p_addressable) {
6334 uint64_t address_mask = peer_adev->dev->dma_mask ?
6335 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6336 resource_size_t aper_limit =
6337 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6339 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6340 aper_limit & address_mask);
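/* For example, a peer limited to 40-bit DMA yields address_mask =
 * ~((1ULL << 40) - 1), so the aperture only qualifies if neither its base
 * nor its last byte has any bit set at or above bit 40.
 */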
6342 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6348 int amdgpu_device_baco_enter(struct drm_device *dev)
6350 struct amdgpu_device *adev = drm_to_adev(dev);
6351 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6353 if (!amdgpu_device_supports_baco(dev))
6356 if (ras && adev->ras_enabled &&
6357 adev->nbio.funcs->enable_doorbell_interrupt)
6358 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6360 return amdgpu_dpm_baco_enter(adev);
6363 int amdgpu_device_baco_exit(struct drm_device *dev)
6365 struct amdgpu_device *adev = drm_to_adev(dev);
6366 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6369 if (!amdgpu_device_supports_baco(dev))
6372 ret = amdgpu_dpm_baco_exit(adev);
6376 if (ras && adev->ras_enabled &&
6377 adev->nbio.funcs->enable_doorbell_interrupt)
6378 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6380 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6381 adev->nbio.funcs->clear_doorbell_interrupt)
6382 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6388 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6389 * @pdev: PCI device struct
6390 * @state: PCI channel state
6392 * Description: Called when a PCI error is detected.
6394 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6396 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6398 struct drm_device *dev = pci_get_drvdata(pdev);
6399 struct amdgpu_device *adev = drm_to_adev(dev);
6402 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6404 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6405 DRM_WARN("No support for XGMI hive yet...");
6406 return PCI_ERS_RESULT_DISCONNECT;
6409 adev->pci_channel_state = state;
6412 case pci_channel_io_normal:
6413 return PCI_ERS_RESULT_CAN_RECOVER;
6414 /* Fatal error, prepare for slot reset */
6415 case pci_channel_io_frozen:
6417 * Locking adev->reset_domain->sem will prevent any external access
6418 * to GPU during PCI error recovery
6420 amdgpu_device_lock_reset_domain(adev->reset_domain);
6421 amdgpu_device_set_mp1_state(adev);
6424 * Block any work scheduling as we do for regular GPU reset
6425 * for the duration of the recovery
6427 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6428 struct amdgpu_ring *ring = adev->rings[i];
6430 if (!amdgpu_ring_sched_ready(ring))
6433 drm_sched_stop(&ring->sched, NULL);
6435 atomic_inc(&adev->gpu_reset_counter);
6436 return PCI_ERS_RESULT_NEED_RESET;
6437 case pci_channel_io_perm_failure:
6438 /* Permanent error, prepare for device removal */
6439 return PCI_ERS_RESULT_DISCONNECT;
6442 return PCI_ERS_RESULT_NEED_RESET;
6446 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6447 * @pdev: pointer to PCI device
6449 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6452 DRM_INFO("PCI error: mmio enabled callback!!\n");
6454 /* TODO - dump whatever for debugging purposes */
6456 /* This is called only if amdgpu_pci_error_detected returns
6457 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6458 * works, no need to reset slot.
6461 return PCI_ERS_RESULT_RECOVERED;
6465 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6466 * @pdev: PCI device struct
6468 * Description: This routine is called by the pci error recovery
6469 * code after the PCI slot has been reset, just before we
6470 * should resume normal operations.
6472 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6474 struct drm_device *dev = pci_get_drvdata(pdev);
6475 struct amdgpu_device *adev = drm_to_adev(dev);
6477 struct amdgpu_reset_context reset_context;
6479 struct list_head device_list;
6481 /* PCI error slot reset should be skipped during RAS recovery */
6482 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6483 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6484 amdgpu_ras_in_recovery(adev))
6485 return PCI_ERS_RESULT_RECOVERED;
6487 DRM_INFO("PCI error: slot reset callback!!\n");
6489 memset(&reset_context, 0, sizeof(reset_context));
6491 INIT_LIST_HEAD(&device_list);
6492 list_add_tail(&adev->reset_list, &device_list);
6494 /* wait for asic to come out of reset */
6497 /* Restore the PCI config space */
6498 amdgpu_device_load_pci_state(pdev);
6500 /* confirm ASIC came out of reset */
6501 for (i = 0; i < adev->usec_timeout; i++) {
6502 memsize = amdgpu_asic_get_config_memsize(adev);
6504 if (memsize != 0xffffffff)
6508 if (memsize == 0xffffffff) {
6513 reset_context.method = AMD_RESET_METHOD_NONE;
6514 reset_context.reset_req_dev = adev;
6515 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6516 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
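/* adev->no_hw_access keeps the pre-reset teardown from touching the
 * hardware; AMDGPU_SKIP_HW_RESET is set because the PCI core's slot reset
 * already reset the ASIC.
 */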
6518 adev->no_hw_access = true;
6519 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6520 adev->no_hw_access = false;
6524 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6528 if (amdgpu_device_cache_pci_state(adev->pdev))
6529 pci_restore_state(adev->pdev);
6531 DRM_INFO("PCIe error recovery succeeded\n");
6533 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6534 amdgpu_device_unset_mp1_state(adev);
6535 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6538 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, 0);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

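/*
 * The four callbacks above implement the kernel's PCI error recovery flow
 * (error_detected -> mmio_enabled/slot_reset -> resume). As an illustrative
 * sketch only (the actual hookup lives with the PCI driver registration,
 * outside this file), they would be wired up roughly like this and referenced
 * from struct pci_driver via its .err_handler field:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */
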
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (amdgpu_sriov_vf(adev))
		return false;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);
		adev->pci_state = pci_store_saved_state(pdev);
		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);
	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

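/*
 * Illustrative ordering only: the PCI config space is cached while the device
 * is healthy and replayed once the ASIC is back, which is how the reset paths
 * use this pair of helpers:
 *
 *	amdgpu_device_cache_pci_state(adev->pdev);
 *	... reset the ASIC ...
 *	amdgpu_device_load_pci_state(adev->pdev);
 */
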
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

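/*
 * Usage sketch (illustrative only, buffer/offset names are hypothetical):
 * flush after the CPU has written data that the GPU will read through the
 * HDP aperture; passing a ring emits the flush on that ring, passing NULL
 * does it via MMIO:
 *
 *	memcpy_toio(adev->mman.aper_base_kaddr + offset, src, size);
 *	amdgpu_device_flush_hdp(adev, NULL);
 *
 * amdgpu_device_invalidate_hdp() is the read-side counterpart before the CPU
 * consumes data the GPU has just written.
 */
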
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. It should then be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

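/*
 * Illustrative call site only (not a path taken in this file): a fatal,
 * unrecoverable error handler could keep the system debuggable with
 *
 *	dev_err(adev->dev, "fatal error, halting GPU\n");
 *	amdgpu_device_halt(adev);
 */
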
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

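/*
 * Both helpers implement the usual index/data pair behind the NBIO-provided
 * offsets: the register offset (dword index * 4) is written to the index
 * register, the posted write is flushed by reading the index back, and the
 * payload is then accessed through the data register under pcie_idx_lock.
 * Hypothetical usage sketch (PORT_REG and PORT_REG_SOME_BIT are illustrative
 * names only):
 *
 *	u32 val = amdgpu_device_pcie_port_rreg(adev, PORT_REG);
 *	amdgpu_device_pcie_port_wreg(adev, PORT_REG, val | PORT_REG_SOME_BIT);
 */
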
/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();

	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

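/*
 * Illustrative caller sketch (hypothetical, simplified): a submitter that
 * gets a non-NULL return value must wait for that previous gang leader
 * before its own gang may run, e.g. by adding it as a scheduler dependency
 * (which consumes the returned reference):
 *
 *	struct dma_fence *old = amdgpu_device_switch_gang(adev, gang_fence);
 *
 *	if (old)
 *		return drm_sched_job_add_dependency(&job->base, old);
 */
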
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

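/*
 * Hypothetical usage sketch (register name, expected value and mask are
 * illustrative only): poll until a status register reports ready, bounded by
 * adev->usec_timeout; a non-zero return means the value was never reached:
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, mmSOME_STATUS, "SOME_STATUS",
 *				       STATUS_READY, STATUS_READY_MASK))
 *		return -ETIMEDOUT;
 */
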
ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

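/*
 * Illustrative sketch (hypothetical sysfs attribute): an IP block can build
 * its supported_reset mask from amdgpu_get_soft_full_reset_mask() on one of
 * its rings and report it through amdgpu_show_reset_mask() below:
 *
 *	static ssize_t reset_mask_show(struct device *dev,
 *				       struct device_attribute *attr, char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *		uint32_t mask = amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
 *
 *		return amdgpu_show_reset_mask(buf, mask);
 *	}
 */
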
ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");